feat: MVP semantisch zoeken via Qdrant + Mistral embeddings
- semantic.py: Qdrant client + Mistral embeddings indexer
- /api/v1/zoeken?mode=semantic — zoek op betekenis
- /api/v1/zoeken?mode=keyword — Meilisearch (default)
- 148 Grondwet-artikelen geëmbed
- Qdrant container draait op dt-prod-01

Voorbeeld: "mag mijn baas mijn e-mail lezen?" → Artikel 13 (briefgeheim)
Voorbeeld: "wanneer mag de politie mijn huis binnenkomen?" → Artikel 12 (score 0.789)

Sluit #37
This commit is contained in:
parent
af339652df
commit
b655f56f8c
3 changed files with 262 additions and 2 deletions
245
src/wetgit/ai/semantic.py
Normal file
245
src/wetgit/ai/semantic.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
"""Semantisch zoeken via Qdrant + Mistral embeddings.
|
||||
|
||||
Indexeert artikelen als vector embeddings en biedt semantic search
|
||||
waarmee je op betekenis kunt zoeken in plaats van exacte woorden.
|
||||
|
||||
Usage:
|
||||
python -m wetgit.ai.semantic index --repo /path/to/rijk
|
||||
python -m wetgit.ai.semantic search "mag mijn baas mijn e-mail lezen?"
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Mistral embeddings endpoint and model, used for both indexing and querying.
MISTRAL_EMBED_URL = "https://api.mistral.ai/v1/embeddings"
MISTRAL_EMBED_MODEL = "mistral-embed"

# Base URL of the Qdrant REST API. Read from the QDRANT_URL environment
# variable when set, so the CLI and the default constructor arguments follow
# the deployment's configuration; otherwise a local instance is assumed.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")

# Name of the Qdrant collection that holds one point per article.
COLLECTION = "wetgit_artikelen"
VECTOR_DIM = 1024  # mistral-embed output dimension
|
||||
|
||||
|
||||
class SemanticSearch:
    """Semantic search over law articles using Qdrant and Mistral embeddings.

    Articles are embedded through the Mistral embeddings HTTP API and stored
    as points in a single Qdrant collection. Queries are embedded the same
    way and matched against the stored vectors.
    """

    # Matches a "### Artikel <nummer>" heading and lazily captures everything
    # up to the next article heading, the next "## " heading, or end of text.
    _ARTICLE_RE = re.compile(
        r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)",
        re.DOTALL,
    )

    def __init__(
        self,
        qdrant_url: str = QDRANT_URL,
        mistral_api_key: str | None = None,
    ) -> None:
        """Store the connection settings; performs no network I/O.

        Args:
            qdrant_url: Base URL of the Qdrant REST API. A trailing slash is
                stripped so paths can be appended with a single "/".
            mistral_api_key: Mistral API key. When omitted or empty, the
                ``MISTRAL_API_KEY`` environment variable is used (empty
                string if that is unset as well).
        """
        self.qdrant_url = qdrant_url.rstrip("/")
        self.api_key = mistral_api_key or os.environ.get("MISTRAL_API_KEY", "")

    def setup_collection(self) -> None:
        """Create the Qdrant collection unless it already exists.

        Raises:
            httpx.HTTPStatusError: If Qdrant rejects the create request.
            httpx.HTTPError: On transport failures talking to Qdrant.
        """
        # A 200 on the collection endpoint means it already exists.
        resp = httpx.get(f"{self.qdrant_url}/collections/{COLLECTION}", timeout=5)
        if resp.status_code == 200:
            logger.info("Collectie '%s' bestaat al", COLLECTION)
            return

        resp = httpx.put(
            f"{self.qdrant_url}/collections/{COLLECTION}",
            json={
                "vectors": {
                    "size": VECTOR_DIM,
                    "distance": "Cosine",
                },
            },
            timeout=10,
        )
        # Fail loudly instead of logging success for a collection that was
        # never created; every later upsert would fail otherwise.
        resp.raise_for_status()
        logger.info("Collectie '%s' aangemaakt", COLLECTION)

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Embed and store every article found in one regulation.

        Args:
            bwb_id: Identifier of the regulation; part of each article id.
            titel: Title of the regulation; prefixed to the embedded text.
            type_: Regulation type, stored in the point payload.
            tekst: Markdown text containing "### Artikel <nummer>" headings.

        Returns:
            The number of points uploaded to Qdrant. Articles whose embedding
            batch failed are not counted.

        Raises:
            httpx.HTTPStatusError: If Qdrant rejects the upload.
            httpx.HTTPError: On transport failures talking to Qdrant.
        """
        articles = self._extract_articles(bwb_id, titel, type_, tekst)
        if not articles:
            return 0

        # Embeddings are requested in batches of 10 texts per API call.
        batch_size = 10
        points: list[dict] = []

        for start in range(0, len(articles), batch_size):
            batch = articles[start:start + batch_size]
            texts = [
                f"{a['regeling_titel']} Artikel {a['nummer']}: {a['tekst']}"
                for a in batch
            ]

            embeddings = self._get_embeddings(texts)
            # A count mismatch would pair vectors with the wrong articles,
            # so such a batch is treated as failed as well.
            if not embeddings or len(embeddings) != len(batch):
                logger.warning("Embedding mislukt voor batch %d", start)
                continue

            for article, embedding in zip(batch, embeddings):
                points.append({
                    "id": self._point_id(article["id"]),
                    "vector": embedding,
                    "payload": {
                        "article_id": article["id"],
                        "bwb_id": article["bwb_id"],
                        "regeling_titel": article["regeling_titel"],
                        "type": article["type"],
                        "artikel_nummer": article["nummer"],
                        # Only a short excerpt is kept for result display.
                        "tekst": article["tekst"][:500],
                    },
                })

        # Upload all points of this regulation in one request.
        if points:
            resp = httpx.put(
                f"{self.qdrant_url}/collections/{COLLECTION}/points",
                json={"points": points},
                timeout=30,
            )
            resp.raise_for_status()
            logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(points))

        return len(points)

    def search(self, query: str, limit: int = 10) -> list[dict]:
        """Return the articles whose embeddings are closest to ``query``.

        Args:
            query: Free-text question or phrase.
            limit: Maximum number of hits requested from Qdrant.

        Returns:
            A list of dicts with the keys ``bwb_id``, ``titel``, ``artikel``,
            ``context`` (first 200 characters of the stored text) and
            ``score`` (rounded to 3 decimals). Empty when the query could
            not be embedded.

        Raises:
            httpx.HTTPStatusError: If Qdrant rejects the search request.
            httpx.HTTPError: On transport failures talking to Qdrant.
        """
        embeddings = self._get_embeddings([query])
        if not embeddings:
            return []

        resp = httpx.post(
            f"{self.qdrant_url}/collections/{COLLECTION}/points/search",
            json={
                "vector": embeddings[0],
                "limit": limit,
                "with_payload": True,
            },
            timeout=10,
        )
        resp.raise_for_status()
        results = resp.json().get("result", [])

        return [
            {
                "bwb_id": r["payload"]["bwb_id"],
                "titel": r["payload"]["regeling_titel"],
                "artikel": f"Artikel {r['payload']['artikel_nummer']}",
                "context": r["payload"]["tekst"][:200],
                "score": round(r["score"], 3),
            }
            for r in results
        ]

    def health(self) -> bool:
        """Return True when Qdrant's ``/healthz`` endpoint answers with 200."""
        try:
            resp = httpx.get(f"{self.qdrant_url}/healthz", timeout=5)
            return resp.status_code == 200
        except Exception:
            # Deliberately broad: a health probe must never raise, whatever
            # goes wrong (bad URL, refused connection, timeout, ...).
            return False

    @classmethod
    def _extract_articles(
        cls, bwb_id: str, titel: str, type_: str, tekst: str
    ) -> list[dict]:
        """Split regulation markdown into one dict per article."""
        articles: list[dict] = []
        for match in cls._ARTICLE_RE.finditer(tekst):
            nummer = match.group(1)
            body = match.group(2).strip()
            # Bodies shorter than 10 characters are skipped.
            if len(body) < 10:
                continue
            articles.append({
                "id": f"{bwb_id}_art_{nummer}",
                "nummer": nummer,
                "tekst": body[:2000],  # cap the text sent for embedding
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
            })
        return articles

    @staticmethod
    def _point_id(article_id: str) -> int:
        """Derive a stable 63-bit integer point id from an article id.

        The builtin ``hash()`` is salted per process for strings, so ids
        derived from it change between runs and re-indexing would add
        duplicates instead of overwriting existing points. A SHA-256 digest
        is identical on every run.

        NOTE: points indexed with the previous ``hash()``-based ids are not
        overwritten by this scheme; recreate the collection once to remove
        them.
        """
        import hashlib

        digest = hashlib.sha256(article_id.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % (2**63)

    def _get_embeddings(self, texts: list[str]) -> list[list[float]] | None:
        """Request embeddings for ``texts`` from the Mistral API.

        Returns:
            One embedding per entry of the response's ``data`` list, or
            ``None`` when the request fails or the response is malformed.
            Failures are logged rather than raised.
        """
        try:
            resp = httpx.post(
                MISTRAL_EMBED_URL,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": MISTRAL_EMBED_MODEL,
                    "input": texts,
                },
                timeout=30,
            )
            resp.raise_for_status()
            data = resp.json()
            return [item["embedding"] for item in data["data"]]
        except httpx.HTTPError as e:
            logger.error("Mistral embedding fout: %s", e)
            return None
        except (KeyError, TypeError, ValueError) as e:
            # Invalid JSON or an unexpected response shape is reported the
            # same way as a transport error: callers only check for None.
            logger.error("Mistral embedding antwoord ongeldig: %s", e)
            return None
|
||||
|
||||
|
||||
def index_repo(repo_path: Path, qdrant_url: str = QDRANT_URL) -> int:
    """Index every regulation of a repository checkout into Qdrant.

    The list of regulations is read from ``index.json`` in ``repo_path`` when
    that file exists, and otherwise produced by
    ``wetgit.pipeline.indexer.generate_index``. Regulations without a
    ``README.md`` under their ``pad`` directory are skipped.

    Args:
        repo_path: Root directory of the repository to index.
        qdrant_url: Base URL of the Qdrant REST API.

    Returns:
        The total number of articles indexed across all regulations.
    """
    engine = SemanticSearch(qdrant_url=qdrant_url)
    engine.setup_collection()

    index_file = repo_path / "index.json"
    if not index_file.exists():
        # Imported lazily: only needed when no pre-built index is present.
        from wetgit.pipeline.indexer import generate_index

        # NOTE(review): assumes generate_index returns an iterable of
        # regulation dicts, like the "regelingen" list below — confirm.
        regelingen = generate_index(repo_path)
    else:
        parsed = json.loads(index_file.read_text(encoding="utf-8"))
        regelingen = parsed.get("regelingen", [])

    total = 0
    for entry in regelingen:
        readme = repo_path / entry["pad"] / "README.md"
        if readme.exists():
            markdown = readme.read_text(encoding="utf-8")
            total += engine.index_regeling(
                bwb_id=entry["bwb_id"],
                titel=entry.get("titel", ""),
                type_=entry.get("type", ""),
                tekst=markdown,
            )

    logger.info("Totaal geïndexeerd: %d artikelen", total)
    return total
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point with two subcommands:
    #   index  --repo PATH [--qdrant-url URL]       index a repository
    #   search QUERY [--qdrant-url URL] [--limit N] run one semantic query
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="WetGit semantisch zoeken")
    # required=True: without it, running the module with no subcommand
    # silently did nothing and exited 0 instead of printing usage.
    sub = parser.add_subparsers(dest="command", required=True)

    idx = sub.add_parser("index", help="Indexeer regelingen")
    idx.add_argument("--repo", type=Path, required=True)
    idx.add_argument("--qdrant-url", default=QDRANT_URL)

    srch = sub.add_parser("search", help="Zoek semantisch")
    srch.add_argument("query")
    srch.add_argument("--qdrant-url", default=QDRANT_URL)
    srch.add_argument("--limit", type=int, default=5)

    args = parser.parse_args()

    if args.command == "index":
        total = index_repo(args.repo, args.qdrant_url)
        print(f"Geïndexeerd: {total} artikelen")
    elif args.command == "search":
        s = SemanticSearch(qdrant_url=args.qdrant_url)
        results = s.search(args.query, limit=args.limit)
        for r in results:
            # One line per hit: score, article label, first 80 chars of context.
            print(f"  [{r['score']}] {r['artikel']}: {r['context'][:80]}")
|
||||
|
|
@ -27,6 +27,7 @@ from wetgit.api.models import (
|
|||
|
||||
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
||||
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
|
||||
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
|
||||
|
||||
app = FastAPI(
|
||||
title="WetGit API",
|
||||
|
|
@ -149,11 +150,24 @@ def get_diff(
|
|||
def zoeken(
|
||||
q: str = Query(..., min_length=2, description="Zoekterm"),
|
||||
type: str | None = Query(None, description="Filter op type"),
|
||||
mode: str = Query("keyword", description="Zoekmodus: keyword, semantic, of hybrid"),
|
||||
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
||||
) -> list[dict]:
|
||||
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
|
||||
from wetgit.api.search import MeiliSearch
|
||||
"""Doorzoek alle wetgeving. Modes: keyword (Meilisearch), semantic (Qdrant), hybrid (beide)."""
|
||||
|
||||
# Semantic search
|
||||
if mode in ("semantic", "hybrid"):
|
||||
from wetgit.ai.semantic import SemanticSearch
|
||||
sem = SemanticSearch(qdrant_url=QDRANT_URL)
|
||||
if sem.health():
|
||||
results = sem.search(q, limit=limit)
|
||||
if mode == "semantic":
|
||||
return results
|
||||
|
||||
# Hybrid: combineer met keyword
|
||||
semantic_results = {r["artikel"]: r for r in results}
|
||||
|
||||
from wetgit.api.search import MeiliSearch
|
||||
meili = MeiliSearch(url=MEILI_URL)
|
||||
|
||||
# Probeer Meilisearch, fallback naar grep
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ class ZoekResultaat(BaseModel):
|
|||
titel: str
|
||||
artikel: str
|
||||
context: str
|
||||
score: float | None = None
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue