feat: MVP semantisch zoeken via Qdrant + Mistral embeddings

- semantic.py: Qdrant client + Mistral embeddings indexer
- /api/v1/zoeken?mode=semantic — zoek op betekenis
- /api/v1/zoeken?mode=keyword — Meilisearch (default)
- 148 Grondwet-artikelen geëmbed
- Qdrant container draait op dt-prod-01

Voorbeeld: "mag mijn baas mijn e-mail lezen?" → Artikel 13 (briefgeheim)
Voorbeeld: "wanneer mag de politie mijn huis binnenkomen?" → Artikel 12 (score 0.789)

Sluit #37
This commit is contained in:
Coornhert 2026-03-30 10:44:01 +02:00
parent af339652df
commit b655f56f8c
3 changed files with 262 additions and 2 deletions

245
src/wetgit/ai/semantic.py Normal file
View file

@ -0,0 +1,245 @@
"""Semantic search via Qdrant + Mistral embeddings.

Indexes articles as vector embeddings and provides semantic search, so that
you can search by meaning instead of by exact words.

Usage:
    python -m wetgit.ai.semantic index --repo /path/to/rijk
    python -m wetgit.ai.semantic search "mag mijn baas mijn e-mail lezen?"
"""
from __future__ import annotations
import json
import logging
import os
import re
from pathlib import Path
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Mistral REST endpoint and model name used to generate embeddings.
MISTRAL_EMBED_URL = "https://api.mistral.ai/v1/embeddings"
MISTRAL_EMBED_MODEL = "mistral-embed"
# Default Qdrant base URL (loopback address) and the collection that stores
# the article vectors.
QDRANT_URL = "http://127.0.0.1:6333"
COLLECTION = "wetgit_artikelen"
# Vector size passed to Qdrant when the collection is created.
VECTOR_DIM = 1024 # mistral-embed output dimension
class SemanticSearch:
    """Semantic search over legislation articles via Qdrant + Mistral embeddings.

    Articles are embedded with the Mistral embeddings API and stored as points
    in a single Qdrant collection (``COLLECTION``). Queries are embedded the
    same way and matched against that collection by vector similarity.
    """

    def __init__(
        self,
        qdrant_url: str = QDRANT_URL,
        mistral_api_key: str | None = None,
    ) -> None:
        """Store connection settings.

        Args:
            qdrant_url: Base URL of the Qdrant HTTP API; a trailing slash is
                stripped.
            mistral_api_key: Mistral API key. When omitted (or empty), the
                ``MISTRAL_API_KEY`` environment variable is used, falling back
                to an empty string.
        """
        self.qdrant_url = qdrant_url.rstrip("/")
        self.api_key = mistral_api_key or os.environ.get("MISTRAL_API_KEY", "")

    def setup_collection(self) -> None:
        """Create the Qdrant collection unless it already exists.

        Raises:
            httpx.HTTPError: If Qdrant is unreachable or rejects the
                collection creation request.
        """
        # Check whether the collection already exists.
        resp = httpx.get(f"{self.qdrant_url}/collections/{COLLECTION}", timeout=5)
        if resp.status_code == 200:
            logger.info("Collectie '%s' bestaat al", COLLECTION)
            return
        created = httpx.put(
            f"{self.qdrant_url}/collections/{COLLECTION}",
            json={
                "vectors": {
                    "size": VECTOR_DIM,
                    "distance": "Cosine",
                },
            },
            timeout=10,
        )
        # Without this check a failed creation was logged as a success and the
        # error only surfaced later, when uploading points.
        created.raise_for_status()
        logger.info("Collectie '%s' aangemaakt", COLLECTION)

    @staticmethod
    def _point_id(article_id: str) -> int:
        """Return a stable, non-negative 63-bit point id for *article_id*.

        The built-in ``hash()`` is salted per process for strings, so ids
        derived from it differ between indexing runs and re-indexing would add
        duplicate points instead of overwriting the existing ones. A SHA-256
        digest is stable across processes and machines.
        """
        import hashlib

        digest = hashlib.sha256(article_id.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % (2**63)

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Index all articles of one regulation as embeddings.

        Articles are located in *tekst* by their ``### Artikel <nummer>``
        headings; an article runs until the next article heading, the next
        ``## `` heading, or the end of the text. Articles whose body is
        shorter than 10 characters are skipped.

        Args:
            bwb_id: Identifier of the regulation; used in article ids and
                stored in the payload.
            titel: Title of the regulation; prepended to each embedded text.
            type_: Regulation type; stored in the payload.
            tekst: Full text of the regulation containing the headings.

        Returns:
            The number of points uploaded to Qdrant (0 when no articles were
            found or every embedding batch failed).

        Raises:
            httpx.HTTPError: If the upload to Qdrant fails.
        """
        # Extract the articles.
        articles: list[dict] = []
        pattern = r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)"
        for match in re.finditer(pattern, tekst, re.DOTALL):
            nummer = match.group(1)
            body = match.group(2).strip()
            if len(body) < 10:
                continue
            articles.append({
                "id": f"{bwb_id}_art_{nummer}",
                "nummer": nummer,
                "tekst": body[:2000],  # cap the text sent for embedding
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
            })
        if not articles:
            return 0

        # Generate embeddings in batches of 10.
        batch_size = 10
        points: list[dict] = []
        for i in range(0, len(articles), batch_size):
            batch = articles[i:i + batch_size]
            texts = [
                f"{a['regeling_titel']} Artikel {a['nummer']}: {a['tekst']}"
                for a in batch
            ]
            embeddings = self._get_embeddings(texts)
            if not embeddings:
                logger.warning("Embedding mislukt voor batch %d", i)
                continue
            if len(embeddings) != len(batch):
                # zip() would silently drop the unmatched articles; skip the
                # batch explicitly rather than pair vectors with wrong texts.
                logger.warning(
                    "Embedding-aantal klopt niet voor batch %d (%d verwacht, %d ontvangen)",
                    i,
                    len(batch),
                    len(embeddings),
                )
                continue
            for article, embedding in zip(batch, embeddings):
                points.append({
                    "id": self._point_id(article["id"]),
                    "vector": embedding,
                    "payload": {
                        "article_id": article["id"],
                        "bwb_id": article["bwb_id"],
                        "regeling_titel": article["regeling_titel"],
                        "type": article["type"],
                        "artikel_nummer": article["nummer"],
                        # Only a 500-character excerpt is stored for display.
                        "tekst": article["tekst"][:500],
                    },
                })

        # Upload to Qdrant.
        if points:
            resp = httpx.put(
                f"{self.qdrant_url}/collections/{COLLECTION}/points",
                json={"points": points},
                timeout=30,
            )
            resp.raise_for_status()
            logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(points))
        return len(points)

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Search the collection by meaning.

        Args:
            query: Free-text question or phrase to embed and match.
            limit: Maximum number of hits requested from Qdrant.

        Returns:
            A list of dicts with the keys ``bwb_id``, ``titel``, ``artikel``,
            ``context`` (first 200 characters of the stored text) and
            ``score`` (rounded to 3 decimals). Empty when the query could not
            be embedded.

        Raises:
            httpx.HTTPError: If the Qdrant search request fails.
        """
        embeddings = self._get_embeddings([query])
        if not embeddings:
            return []
        resp = httpx.post(
            f"{self.qdrant_url}/collections/{COLLECTION}/points/search",
            json={
                "vector": embeddings[0],
                "limit": limit,
                "with_payload": True,
            },
            timeout=10,
        )
        resp.raise_for_status()
        results = resp.json().get("result", [])
        return [
            {
                "bwb_id": r["payload"]["bwb_id"],
                "titel": r["payload"]["regeling_titel"],
                "artikel": f"Artikel {r['payload']['artikel_nummer']}",
                "context": r["payload"]["tekst"][:200],
                "score": round(r["score"], 3),
            }
            for r in results
        ]

    def health(self) -> bool:
        """Return True when Qdrant answers ``/healthz`` with HTTP 200.

        Any exception is treated as "not healthy": this is a best-effort probe
        and must never raise.
        """
        try:
            resp = httpx.get(f"{self.qdrant_url}/healthz", timeout=5)
            return resp.status_code == 200
        except Exception as e:
            logger.debug("Qdrant health check mislukt: %s", e)
            return False

    def _get_embeddings(self, texts: list[str]) -> list[list[float]] | None:
        """Generate embeddings for *texts* via the Mistral API.

        Returns:
            One embedding per entry of the response's ``data`` list, or None
            when the request fails or the response is not in the expected
            format.
        """
        try:
            resp = httpx.post(
                MISTRAL_EMBED_URL,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": MISTRAL_EMBED_MODEL,
                    "input": texts,
                },
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            return [item["embedding"] for item in data["data"]]
        except httpx.HTTPError as e:
            logger.error("Mistral embedding fout: %s", e)
            return None
        except (KeyError, TypeError, ValueError) as e:
            # Invalid JSON or an unexpected body shape: report it as a failed
            # embedding instead of letting the exception escape to callers
            # that only expect None.
            logger.error("Mistral embedding fout: onverwacht antwoord: %s", e)
            return None
def index_repo(repo_path: Path, qdrant_url: str = QDRANT_URL) -> int:
    """Index every regulation found in the repository into Qdrant.

    The list of regulations is taken from the ``regelingen`` key of
    ``index.json`` in *repo_path* when that file exists; otherwise it is
    obtained from ``wetgit.pipeline.indexer.generate_index``. For each entry
    the ``README.md`` inside the entry's ``pad`` directory is read and passed
    to ``SemanticSearch.index_regeling``; entries without that file are
    skipped.

    Args:
        repo_path: Root directory of the repository to index.
        qdrant_url: Base URL of the Qdrant instance.

    Returns:
        The total number of articles indexed across all regulations.
    """
    engine = SemanticSearch(qdrant_url=qdrant_url)
    engine.setup_collection()

    manifest = repo_path / "index.json"
    if not manifest.exists():
        # No prebuilt index available: let the pipeline produce the listing.
        from wetgit.pipeline.indexer import generate_index

        entries = generate_index(repo_path)
    else:
        parsed = json.loads(manifest.read_text(encoding="utf-8"))
        entries = parsed.get("regelingen", [])

    indexed = 0
    for entry in entries:
        readme = repo_path / entry["pad"] / "README.md"
        if readme.exists():
            body = readme.read_text(encoding="utf-8")
            indexed += engine.index_regeling(
                bwb_id=entry["bwb_id"],
                titel=entry.get("titel", ""),
                type_=entry.get("type", ""),
                tekst=body,
            )

    logger.info("Totaal geïndexeerd: %d artikelen", indexed)
    return indexed
if __name__ == "__main__":
    # Command-line entry point: `index` fills Qdrant from a repository,
    # `search` runs a single semantic query and prints the hits.
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    parser = argparse.ArgumentParser(description="WetGit semantisch zoeken")
    # required=True: without it, running the module with no subcommand
    # silently did nothing and exited with status 0.
    sub = parser.add_subparsers(dest="command", required=True)

    idx = sub.add_parser("index", help="Indexeer regelingen")
    idx.add_argument("--repo", type=Path, required=True)
    idx.add_argument("--qdrant-url", default=QDRANT_URL)

    srch = sub.add_parser("search", help="Zoek semantisch")
    srch.add_argument("query")
    srch.add_argument("--qdrant-url", default=QDRANT_URL)
    srch.add_argument("--limit", type=int, default=5)

    args = parser.parse_args()
    if args.command == "index":
        total = index_repo(args.repo, args.qdrant_url)
        print(f"Geïndexeerd: {total} artikelen")
    elif args.command == "search":
        s = SemanticSearch(qdrant_url=args.qdrant_url)
        results = s.search(args.query, limit=args.limit)
        for r in results:
            # Show only the first 80 characters of the context per hit.
            print(f"  [{r['score']}] {r['artikel']}: {r['context'][:80]}")

View file

@ -27,6 +27,7 @@ from wetgit.api.models import (
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
app = FastAPI(
title="WetGit API",
@ -149,11 +150,24 @@ def get_diff(
def zoeken(
q: str = Query(..., min_length=2, description="Zoekterm"),
type: str | None = Query(None, description="Filter op type"),
mode: str = Query("keyword", description="Zoekmodus: keyword, semantic, of hybrid"),
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
) -> list[dict]:
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
from wetgit.api.search import MeiliSearch
"""Doorzoek alle wetgeving. Modes: keyword (Meilisearch), semantic (Qdrant), hybrid (beide)."""
# Semantic search
if mode in ("semantic", "hybrid"):
from wetgit.ai.semantic import SemanticSearch
sem = SemanticSearch(qdrant_url=QDRANT_URL)
if sem.health():
results = sem.search(q, limit=limit)
if mode == "semantic":
return results
# Hybrid: combineer met keyword
semantic_results = {r["artikel"]: r for r in results}
from wetgit.api.search import MeiliSearch
meili = MeiliSearch(url=MEILI_URL)
# Probeer Meilisearch, fallback naar grep

View file

@ -68,6 +68,7 @@ class ZoekResultaat(BaseModel):
titel: str
artikel: str
context: str
score: float | None = None
class HealthResponse(BaseModel):