From b655f56f8cb5aaa714e331865b34c5dbed827718 Mon Sep 17 00:00:00 2001
From: Coornhert
Date: Mon, 30 Mar 2026 10:44:01 +0200
Subject: [PATCH] feat: MVP semantisch zoeken via Qdrant + Mistral embeddings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- semantic.py: Qdrant client + Mistral embeddings indexer
- /api/v1/zoeken?mode=semantic — zoek op betekenis
- /api/v1/zoeken?mode=keyword — Meilisearch (default)
- 148 Grondwet-artikelen geëmbed
- Qdrant container draait op dt-prod-01

Voorbeeld: "mag mijn baas mijn e-mail lezen?" → Artikel 13 (briefgeheim)
Voorbeeld: "wanneer mag de politie mijn huis binnenkomen?" → Artikel 12 (score 0.789)

Sluit #37
---
 src/wetgit/ai/semantic.py | 292 ++++++++++++++++++++++++++++++++++++++
 src/wetgit/api/app.py     |  18 ++-
 src/wetgit/api/models.py  |   1 +
 3 files changed, 309 insertions(+), 2 deletions(-)
 create mode 100644 src/wetgit/ai/semantic.py

diff --git a/src/wetgit/ai/semantic.py b/src/wetgit/ai/semantic.py
new file mode 100644
index 0000000..fa05acf
--- /dev/null
+++ b/src/wetgit/ai/semantic.py
@@ -0,0 +1,292 @@
+"""Semantic search via Qdrant + Mistral embeddings.
+
+Indexes articles as vector embeddings and offers semantic search, so you
+can search by meaning instead of exact words.
+
+Usage:
+    python -m wetgit.ai.semantic index --repo /path/to/rijk
+    python -m wetgit.ai.semantic search "mag mijn baas mijn e-mail lezen?"
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+import re
+from pathlib import Path
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+MISTRAL_EMBED_URL = "https://api.mistral.ai/v1/embeddings"
+MISTRAL_EMBED_MODEL = "mistral-embed"
+QDRANT_URL = "http://127.0.0.1:6333"
+COLLECTION = "wetgit_artikelen"
+VECTOR_DIM = 1024  # mistral-embed output dimension
+
+# One "### Artikel <nr>" section: runs to the next article, the next "## "
+# heading, or the end of the text.
+_ARTICLE_RE = re.compile(
+    r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)", re.DOTALL
+)
+
+
+def _stable_point_id(article_id: str) -> int:
+    """Return a deterministic 63-bit Qdrant point id for *article_id*.
+
+    The built-in ``hash()`` is salted per process for ``str``, so it would
+    give each indexing run new ids and add duplicates instead of updating
+    the existing points.
+    """
+    digest = hashlib.sha256(article_id.encode("utf-8")).digest()
+    return int.from_bytes(digest[:8], "big") % (2**63)
+
+
+class SemanticSearch:
+    """Semantic search backed by Qdrant and Mistral embeddings."""
+
+    def __init__(
+        self,
+        qdrant_url: str = QDRANT_URL,
+        mistral_api_key: str | None = None,
+    ) -> None:
+        """Store the Qdrant base URL and the Mistral API key.
+
+        The key falls back to the ``MISTRAL_API_KEY`` environment variable
+        (empty string when unset).
+        """
+        self.qdrant_url = qdrant_url.rstrip("/")
+        self.api_key = mistral_api_key or os.environ.get("MISTRAL_API_KEY", "")
+
+    def setup_collection(self) -> None:
+        """Create the Qdrant collection unless it already exists.
+
+        Raises:
+            httpx.HTTPStatusError: If Qdrant rejects the create request.
+        """
+        url = f"{self.qdrant_url}/collections/{COLLECTION}"
+        if httpx.get(url, timeout=5).status_code == 200:
+            logger.info("Collectie '%s' bestaat al", COLLECTION)
+            return
+
+        resp = httpx.put(
+            url,
+            json={"vectors": {"size": VECTOR_DIM, "distance": "Cosine"}},
+            timeout=10,
+        )
+        # Fail loudly instead of logging success for a rejected request.
+        resp.raise_for_status()
+        logger.info("Collectie '%s' aangemaakt", COLLECTION)
+
+    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
+        """Embed and upload every article of one regulation.
+
+        Args:
+            bwb_id: Identifier of the regulation; prefixes the article ids.
+            titel: Title of the regulation, stored in the payload.
+            type_: Type of the regulation, stored in the payload.
+            tekst: Markdown text containing "### Artikel <nr>" sections.
+
+        Returns:
+            The number of points uploaded to Qdrant.
+
+        Raises:
+            httpx.HTTPStatusError: If the upload to Qdrant fails.
+        """
+        articles: list[dict] = []
+        for match in _ARTICLE_RE.finditer(tekst):
+            nummer = match.group(1)
+            body = match.group(2).strip()
+            if len(body) < 10:  # skip empty or stub articles
+                continue
+            articles.append({
+                "id": f"{bwb_id}_art_{nummer}",
+                "nummer": nummer,
+                "tekst": body[:2000],  # cap the text sent for embedding
+                "bwb_id": bwb_id,
+                "regeling_titel": titel,
+                "type": type_,
+            })
+
+        if not articles:
+            return 0
+
+        batch_size = 10  # articles per embedding request
+        points: list[dict] = []
+        for start in range(0, len(articles), batch_size):
+            batch = articles[start:start + batch_size]
+            texts = [
+                f"{a['regeling_titel']} Artikel {a['nummer']}: {a['tekst']}"
+                for a in batch
+            ]
+
+            embeddings = self._get_embeddings(texts)
+            if not embeddings or len(embeddings) != len(batch):
+                # A short reply would pair vectors with the wrong articles.
+                logger.warning("Embedding mislukt voor batch %d", start)
+                continue
+
+            for article, embedding in zip(batch, embeddings):
+                points.append({
+                    "id": _stable_point_id(article["id"]),
+                    "vector": embedding,
+                    "payload": {
+                        "article_id": article["id"],
+                        "bwb_id": article["bwb_id"],
+                        "regeling_titel": article["regeling_titel"],
+                        "type": article["type"],
+                        "artikel_nummer": article["nummer"],
+                        "tekst": article["tekst"][:500],
+                    },
+                })
+
+        if points:  # upload everything for this regulation in one request
+            resp = httpx.put(
+                f"{self.qdrant_url}/collections/{COLLECTION}/points",
+                json={"points": points},
+                timeout=30,
+            )
+            resp.raise_for_status()
+            logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(points))
+
+        return len(points)
+
+    def search(self, query: str, limit: int = 10) -> list[dict]:
+        """Return the articles closest in meaning to *query*.
+
+        Returns an empty list when no embedding could be obtained.
+
+        Raises:
+            httpx.HTTPStatusError: If the Qdrant search request fails.
+        """
+        embeddings = self._get_embeddings([query])
+        if not embeddings:
+            return []
+
+        resp = httpx.post(
+            f"{self.qdrant_url}/collections/{COLLECTION}/points/search",
+            json={
+                "vector": embeddings[0],
+                "limit": limit,
+                "with_payload": True,
+            },
+            timeout=10,
+        )
+        resp.raise_for_status()
+        results = resp.json().get("result", [])
+
+        return [
+            {
+                "bwb_id": r["payload"]["bwb_id"],
+                "titel": r["payload"]["regeling_titel"],
+                "artikel": f"Artikel {r['payload']['artikel_nummer']}",
+                "context": r["payload"]["tekst"][:200],
+                "score": round(r["score"], 3),
+            }
+            for r in results
+        ]
+
+    def health(self) -> bool:
+        """Return True when Qdrant answers its health endpoint with 200."""
+        try:
+            resp = httpx.get(f"{self.qdrant_url}/healthz", timeout=5)
+            return resp.status_code == 200
+        except Exception:  # best-effort probe: any failure means "down"
+            return False
+
+    def _get_embeddings(self, texts: list[str]) -> list[list[float]] | None:
+        """Return one embedding per text from the Mistral API.
+
+        Returns None, after logging, when the request fails or the reply
+        does not have the expected shape.
+        """
+        try:
+            resp = httpx.post(
+                MISTRAL_EMBED_URL,
+                headers={
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                },
+                json={
+                    "model": MISTRAL_EMBED_MODEL,
+                    "input": texts,
+                },
+                timeout=30,
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            return [item["embedding"] for item in data["data"]]
+        except (httpx.HTTPError, ValueError, KeyError, TypeError) as e:
+            logger.error("Mistral embedding fout: %s", e)
+            return None
+
+
+def index_repo(repo_path: Path, qdrant_url: str = QDRANT_URL) -> int:
+    """Index every regulation of the repo in Qdrant.
+
+    Reads ``index.json`` when present and otherwise generates the index.
+    Regulations without a ``README.md`` are skipped.
+
+    Returns:
+        The total number of articles indexed.
+    """
+    search = SemanticSearch(qdrant_url=qdrant_url)
+    search.setup_collection()
+
+    total = 0
+    index_path = repo_path / "index.json"
+    if index_path.exists():
+        data = json.loads(index_path.read_text(encoding="utf-8"))
+        regelingen = data.get("regelingen", [])
+    else:
+        from wetgit.pipeline.indexer import generate_index
+        regelingen = generate_index(repo_path)
+
+    for regeling in regelingen:
+        md_path = repo_path / regeling["pad"] / "README.md"
+        if not md_path.exists():
+            continue
+        tekst = md_path.read_text(encoding="utf-8")
+        count = search.index_regeling(
+            bwb_id=regeling["bwb_id"],
+            titel=regeling.get("titel", ""),
+            type_=regeling.get("type", ""),
+            tekst=tekst,
+        )
+        total += count
+
+    logger.info("Totaal geïndexeerd: %d artikelen", total)
+    return total
+
+
+if __name__ == "__main__":
+    import argparse
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    parser = argparse.ArgumentParser(description="WetGit semantisch zoeken")
+    # Without a subcommand the script used to exit silently; require one.
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    idx = sub.add_parser("index", help="Indexeer regelingen")
+    idx.add_argument("--repo", type=Path, required=True)
+    idx.add_argument("--qdrant-url", default=QDRANT_URL)
+
+    srch = sub.add_parser("search", help="Zoek semantisch")
+    srch.add_argument("query")
+    srch.add_argument("--qdrant-url", default=QDRANT_URL)
+    srch.add_argument("--limit", type=int, default=5)
+
+    args = parser.parse_args()
+
+    if args.command == "index":
+        total = index_repo(args.repo, args.qdrant_url)
+        print(f"Geïndexeerd: {total} artikelen")
+    elif args.command == "search":
+        s = SemanticSearch(qdrant_url=args.qdrant_url)
+        results = s.search(args.query, limit=args.limit)
+        for r in results:
+            print(f" [{r['score']}] {r['artikel']}: {r['context'][:80]}")
diff --git a/src/wetgit/api/app.py b/src/wetgit/api/app.py
index 73e1810..b65880f 100644
--- a/src/wetgit/api/app.py
+++ b/src/wetgit/api/app.py
@@ -27,6 +27,7 @@ from wetgit.api.models import (
 
 REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
 MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
+QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
 
 app = FastAPI(
     title="WetGit API",
@@ -149,11 +150,24 @@ def get_diff(
 def zoeken(
     q: str = Query(..., min_length=2, description="Zoekterm"),
     type: str | None = Query(None, description="Filter op type"),
+    mode: str = Query("keyword", description="Zoekmodus: keyword, semantic, of hybrid"),
     limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
 ) -> list[dict]:
-    """Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
-    from wetgit.api.search import MeiliSearch
+    """Doorzoek alle wetgeving. Modes: keyword (Meilisearch), semantic (Qdrant), hybrid (beide)."""
+    # Semantic search
+    if mode in ("semantic", "hybrid"):
+        from wetgit.ai.semantic import SemanticSearch
+        sem = SemanticSearch(qdrant_url=QDRANT_URL)
+        if sem.health():
+            results = sem.search(q, limit=limit)
+            if mode == "semantic":
+                return results
+
+            # Hybrid: combineer met keyword
+            semantic_results = {r["artikel"]: r for r in results}
+
+    from wetgit.api.search import MeiliSearch
 
     meili = MeiliSearch(url=MEILI_URL)
 
     # Probeer Meilisearch, fallback naar grep
diff --git a/src/wetgit/api/models.py b/src/wetgit/api/models.py
index 509cabc..7ca1efd 100644
--- a/src/wetgit/api/models.py
+++ b/src/wetgit/api/models.py
@@ -68,6 +68,7 @@ class ZoekResultaat(BaseModel):
     titel: str
     artikel: str
     context: str
+    score: float | None = None
 
 
 class HealthResponse(BaseModel):