feat: MVP semantisch zoeken via Qdrant + Mistral embeddings
- semantic.py: Qdrant client + Mistral embeddings indexer
- /api/v1/zoeken?mode=semantic — zoek op betekenis
- /api/v1/zoeken?mode=keyword — Meilisearch (default)
- 148 Grondwet-artikelen geëmbed
- Qdrant container draait op dt-prod-01

Voorbeeld: "mag mijn baas mijn e-mail lezen?" → Artikel 13 (briefgeheim)
Voorbeeld: "wanneer mag de politie mijn huis binnenkomen?" → Artikel 12 (score 0.789)

Sluit #37
This commit is contained in:
parent
af339652df
commit
b655f56f8c
3 changed files with 262 additions and 2 deletions
245
src/wetgit/ai/semantic.py
Normal file
245
src/wetgit/ai/semantic.py
Normal file
|
|
@ -0,0 +1,245 @@
|
||||||
|
"""Semantisch zoeken via Qdrant + Mistral embeddings.
|
||||||
|
|
||||||
|
Indexeert artikelen als vector embeddings en biedt semantic search
|
||||||
|
waarmee je op betekenis kunt zoeken in plaats van exacte woorden.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m wetgit.ai.semantic index --repo /path/to/rijk
|
||||||
|
python -m wetgit.ai.semantic search "mag mijn baas mijn e-mail lezen?"
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# Mistral embeddings endpoint and model, used for both indexing and querying.
MISTRAL_EMBED_URL = "https://api.mistral.ai/v1/embeddings"
MISTRAL_EMBED_MODEL = "mistral-embed"

# Base URL of the Qdrant instance. Read from the QDRANT_URL environment
# variable (the same variable and fallback the API layer uses) so the CLI and
# index_repo() talk to the same Qdrant as the API without extra flags.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")

# Name of the Qdrant collection holding one point per article.
COLLECTION = "wetgit_artikelen"

VECTOR_DIM = 1024  # mistral-embed output dimension
|
||||||
|
|
||||||
|
|
||||||
|
class SemanticSearch:
    """Semantic search over articles using Qdrant vectors and Mistral embeddings.

    Articles are extracted from a regulation's Markdown text, embedded through
    the Mistral embeddings API and stored as points in the Qdrant collection
    named by ``COLLECTION``. Queries are embedded the same way and matched by
    vector similarity.
    """

    # Matches "### Artikel <nummer>" plus everything up to the next article
    # heading, the next "## " heading, or the end of the text.
    _ARTICLE_RE = re.compile(
        r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)", re.DOTALL
    )

    # Number of article texts sent to the embeddings API per request.
    _BATCH_SIZE = 10

    def __init__(
        self,
        qdrant_url: str = QDRANT_URL,
        mistral_api_key: str | None = None,
    ) -> None:
        """Store connection settings; no network access happens here.

        Args:
            qdrant_url: Base URL of the Qdrant HTTP API (trailing slashes are
                stripped).
            mistral_api_key: API key for Mistral. Falls back to the
                ``MISTRAL_API_KEY`` environment variable, then to ``""``.
        """
        self.qdrant_url = qdrant_url.rstrip("/")
        self.api_key = mistral_api_key or os.environ.get("MISTRAL_API_KEY", "")

    @staticmethod
    def _point_id(article_id: str) -> int:
        """Return a deterministic non-negative 63-bit point ID for *article_id*.

        The built-in ``hash()`` of a ``str`` is salted per process, so it
        yields a different ID on every run and re-indexing would add duplicate
        points instead of overwriting the existing ones. A SHA-256 digest is
        stable across processes and machines.
        """
        import hashlib  # local import: keeps this block self-contained

        digest = hashlib.sha256(article_id.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % (2**63)

    def setup_collection(self) -> None:
        """Create the Qdrant collection unless it already exists.

        Raises:
            httpx.HTTPError: If Qdrant is unreachable or rejects the creation
                request.
        """
        collection_url = f"{self.qdrant_url}/collections/{COLLECTION}"

        # A 200 on the collection endpoint means it already exists.
        resp = httpx.get(collection_url, timeout=5)
        if resp.status_code == 200:
            logger.info("Collectie '%s' bestaat al", COLLECTION)
            return

        created = httpx.put(
            collection_url,
            json={
                "vectors": {
                    "size": VECTOR_DIM,
                    "distance": "Cosine",
                },
            },
            timeout=10,
        )
        # Fail loudly instead of logging success for a collection that was
        # never created.
        created.raise_for_status()
        logger.info("Collectie '%s' aangemaakt", COLLECTION)

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Embed and upload all articles of one regulation.

        Args:
            bwb_id: Identifier of the regulation; used in article IDs and
                stored in the payload.
            titel: Title of the regulation; prefixed to each embedded text.
            type_: Regulation type, stored in the payload as ``type``.
            tekst: Markdown text containing ``### Artikel <nummer>`` headings.

        Returns:
            The number of points uploaded to Qdrant (0 if no article was
            found or every embedding batch failed).

        Raises:
            httpx.HTTPError: If the upload to Qdrant fails.
        """
        articles: list[dict] = []
        for match in self._ARTICLE_RE.finditer(tekst):
            nummer = match.group(1)
            body = match.group(2).strip()
            # Skip headings with (almost) no text below them.
            if len(body) < 10:
                continue

            articles.append({
                "id": f"{bwb_id}_art_{nummer}",
                "nummer": nummer,
                "tekst": body[:2000],  # cap the text sent for embedding
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
            })

        if not articles:
            return 0

        points: list[dict] = []
        for start in range(0, len(articles), self._BATCH_SIZE):
            batch = articles[start:start + self._BATCH_SIZE]
            texts = [
                f"{a['regeling_titel']} Artikel {a['nummer']}: {a['tekst']}"
                for a in batch
            ]

            embeddings = self._get_embeddings(texts)
            # A count mismatch would make zip() silently pair or drop
            # articles, so treat it like a failed batch.
            if not embeddings or len(embeddings) != len(batch):
                logger.warning("Embedding mislukt voor batch %d", start)
                continue

            for article, embedding in zip(batch, embeddings):
                points.append({
                    "id": self._point_id(article["id"]),
                    "vector": embedding,
                    "payload": {
                        "article_id": article["id"],
                        "bwb_id": article["bwb_id"],
                        "regeling_titel": article["regeling_titel"],
                        "type": article["type"],
                        "artikel_nummer": article["nummer"],
                        "tekst": article["tekst"][:500],
                    },
                })

        # Upload everything for this regulation in a single request.
        if points:
            resp = httpx.put(
                f"{self.qdrant_url}/collections/{COLLECTION}/points",
                json={"points": points},
                timeout=30,
            )
            resp.raise_for_status()
            logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(points))

        return len(points)

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Return the articles whose embeddings are closest to *query*.

        Args:
            query: Free-text question or phrase.
            limit: Maximum number of hits requested from Qdrant.

        Returns:
            A list of dicts with keys ``bwb_id``, ``titel``, ``artikel``,
            ``context`` (first 200 characters of the stored text) and
            ``score`` (rounded to 3 decimals). Empty if the query could not
            be embedded.

        Raises:
            httpx.HTTPError: If the Qdrant search request fails.
        """
        embeddings = self._get_embeddings([query])
        if not embeddings:
            return []

        resp = httpx.post(
            f"{self.qdrant_url}/collections/{COLLECTION}/points/search",
            json={
                "vector": embeddings[0],
                "limit": limit,
                "with_payload": True,
            },
            timeout=10,
        )
        resp.raise_for_status()
        results = resp.json().get("result", [])

        return [
            {
                "bwb_id": r["payload"]["bwb_id"],
                "titel": r["payload"]["regeling_titel"],
                "artikel": f"Artikel {r['payload']['artikel_nummer']}",
                "context": r["payload"]["tekst"][:200],
                "score": round(r["score"], 3),
            }
            for r in results
        ]

    def health(self) -> bool:
        """Return True if Qdrant answers ``/healthz`` with HTTP 200."""
        try:
            resp = httpx.get(f"{self.qdrant_url}/healthz", timeout=5)
            return resp.status_code == 200
        except Exception:
            # Deliberately best-effort: any failure means "not healthy".
            return False

    def _get_embeddings(self, texts: list[str]) -> list[list[float]] | None:
        """Fetch one embedding per text from the Mistral API.

        Args:
            texts: Texts to embed in a single request.

        Returns:
            The embeddings in response order, or ``None`` when no API key is
            configured, the request fails, or the response is malformed.
        """
        if not self.api_key:
            # Without a key the request can only be rejected; skip the
            # round-trip and report it the same way as any other failure.
            logger.error("Mistral embedding fout: %s", "MISTRAL_API_KEY ontbreekt")
            return None

        try:
            resp = httpx.post(
                MISTRAL_EMBED_URL,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": MISTRAL_EMBED_MODEL,
                    "input": texts,
                },
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            return [item["embedding"] for item in data["data"]]
        except httpx.HTTPError as e:
            logger.error("Mistral embedding fout: %s", e)
            return None
        except (KeyError, TypeError, ValueError) as e:
            # Invalid JSON or an unexpected response shape: keep the documented
            # "None on failure" contract instead of raising out of search().
            logger.error("Mistral embedding fout: %s", e)
            return None
|
||||||
|
|
||||||
|
|
||||||
|
def index_repo(repo_path: Path, qdrant_url: str = QDRANT_URL) -> int:
    """Index every regulation found in *repo_path* into Qdrant.

    The list of regulations comes from ``index.json`` in the repo root (key
    ``regelingen``) when that file exists; otherwise it is produced by
    ``wetgit.pipeline.indexer.generate_index`` (presumably returning the same
    list-of-dicts shape; verify against that module). Entries whose
    ``README.md`` is missing are skipped.

    Args:
        repo_path: Root directory of the regulations repository.
        qdrant_url: Base URL of the Qdrant HTTP API.

    Returns:
        Total number of articles indexed across all regulations.
    """
    engine = SemanticSearch(qdrant_url=qdrant_url)
    engine.setup_collection()

    manifest = repo_path / "index.json"
    if not manifest.exists():
        # Imported lazily; only needed when no prebuilt index is present.
        from wetgit.pipeline.indexer import generate_index

        entries = generate_index(repo_path)
    else:
        manifest_data = json.loads(manifest.read_text(encoding="utf-8"))
        entries = manifest_data.get("regelingen", [])

    indexed = 0
    for entry in entries:
        readme = repo_path / entry["pad"] / "README.md"
        if readme.exists():
            markdown = readme.read_text(encoding="utf-8")
            indexed += engine.index_regeling(
                bwb_id=entry["bwb_id"],
                titel=entry.get("titel", ""),
                type_=entry.get("type", ""),
                tekst=markdown,
            )

    logger.info("Totaal geïndexeerd: %d artikelen", indexed)
    return indexed
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: "index" a repository or "search" by meaning.
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="WetGit semantisch zoeken")
    # required=True: running without a subcommand now prints a usage error
    # instead of silently doing nothing and exiting with status 0.
    sub = parser.add_subparsers(dest="command", required=True)

    idx = sub.add_parser("index", help="Indexeer regelingen")
    idx.add_argument("--repo", type=Path, required=True)
    idx.add_argument("--qdrant-url", default=QDRANT_URL)

    srch = sub.add_parser("search", help="Zoek semantisch")
    srch.add_argument("query")
    srch.add_argument("--qdrant-url", default=QDRANT_URL)
    srch.add_argument("--limit", type=int, default=5)

    args = parser.parse_args()

    if args.command == "index":
        total = index_repo(args.repo, args.qdrant_url)
        print(f"Geïndexeerd: {total} artikelen")
    elif args.command == "search":
        s = SemanticSearch(qdrant_url=args.qdrant_url)
        results = s.search(args.query, limit=args.limit)
        # One line per hit: score, article label, first 80 chars of context.
        for r in results:
            print(f"  [{r['score']}] {r['artikel']}: {r['context'][:80]}")
|
||||||
|
|
@ -27,6 +27,7 @@ from wetgit.api.models import (
|
||||||
|
|
||||||
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
||||||
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
|
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
|
||||||
|
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="WetGit API",
|
title="WetGit API",
|
||||||
|
|
@ -149,11 +150,24 @@ def get_diff(
|
||||||
def zoeken(
|
def zoeken(
|
||||||
q: str = Query(..., min_length=2, description="Zoekterm"),
|
q: str = Query(..., min_length=2, description="Zoekterm"),
|
||||||
type: str | None = Query(None, description="Filter op type"),
|
type: str | None = Query(None, description="Filter op type"),
|
||||||
|
mode: str = Query("keyword", description="Zoekmodus: keyword, semantic, of hybrid"),
|
||||||
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
|
"""Doorzoek alle wetgeving. Modes: keyword (Meilisearch), semantic (Qdrant), hybrid (beide)."""
|
||||||
from wetgit.api.search import MeiliSearch
|
|
||||||
|
|
||||||
|
# Semantic search
|
||||||
|
if mode in ("semantic", "hybrid"):
|
||||||
|
from wetgit.ai.semantic import SemanticSearch
|
||||||
|
sem = SemanticSearch(qdrant_url=QDRANT_URL)
|
||||||
|
if sem.health():
|
||||||
|
results = sem.search(q, limit=limit)
|
||||||
|
if mode == "semantic":
|
||||||
|
return results
|
||||||
|
|
||||||
|
# Hybrid: combineer met keyword
|
||||||
|
semantic_results = {r["artikel"]: r for r in results}
|
||||||
|
|
||||||
|
from wetgit.api.search import MeiliSearch
|
||||||
meili = MeiliSearch(url=MEILI_URL)
|
meili = MeiliSearch(url=MEILI_URL)
|
||||||
|
|
||||||
# Probeer Meilisearch, fallback naar grep
|
# Probeer Meilisearch, fallback naar grep
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,7 @@ class ZoekResultaat(BaseModel):
|
||||||
titel: str
|
titel: str
|
||||||
artikel: str
|
artikel: str
|
||||||
context: str
|
context: str
|
||||||
|
score: float | None = None
|
||||||
|
|
||||||
|
|
||||||
class HealthResponse(BaseModel):
|
class HealthResponse(BaseModel):
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue