feat: MVP Meilisearch full-text search

- search.py: Meilisearch client + indexer (per artikel)
- /api/v1/zoeken gebruikt nu Meilisearch (met grep fallback)
- Typo-tolerant: "godsdiensst" → vindt art. 1, 6, 23
- 1ms responstijd
- 176 artikelen Grondwet geïndexeerd
- Meilisearch container draait op dt-prod-01

Sluit #35
This commit is contained in:
Coornhert 2026-03-30 10:30:21 +02:00
parent 21be1367d1
commit c3d1efc3df
2 changed files with 209 additions and 7 deletions

View file

@ -26,6 +26,7 @@ from wetgit.api.models import (
)
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
app = FastAPI(
title="WetGit API",
@ -150,20 +151,34 @@ def zoeken(
type: str | None = Query(None, description="Filter op type"),
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
) -> list[dict]:
"""Doorzoek alle wetgeving (full-text)."""
import re
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
from wetgit.api.search import MeiliSearch
meili = MeiliSearch(url=MEILI_URL)
# Probeer Meilisearch, fallback naar grep
if meili.health():
filter_str = f'type = "{type}"' if type else None
result = meili.search(q, filter_=filter_str, limit=limit)
return [
{
"bwb_id": hit["bwb_id"],
"titel": hit.get("regeling_titel", ""),
"artikel": f"Artikel {hit.get('artikel_nummer', '?')}",
"context": hit.get("tekst", "")[:200],
}
for hit in result.get("hits", [])
]
# Fallback: grep-style zoeken
resultaten: list[dict] = []
for regeling in store.list_regelingen():
if type and regeling.get("type") != type:
continue
tekst = store.get_tekst(regeling["bwb_id"])
if tekst is None or q.lower() not in tekst.lower():
continue
# Zoek in welk artikel de match zit
current_artikel = ""
for line in tekst.split("\n"):
if line.startswith("### Artikel"):
@ -177,5 +192,4 @@ def zoeken(
})
if len(resultaten) >= limit:
return resultaten
return resultaten

188
src/wetgit/api/search.py Normal file
View file

@ -0,0 +1,188 @@
"""Meilisearch integratie — indexer en zoekfunctie.
Indexeert alle artikelen in Meilisearch voor full-text search
met Nederlandse stemming en typo-tolerantie.
"""
from __future__ import annotations
import logging
import re
from pathlib import Path
import httpx
logger = logging.getLogger(__name__)
DEFAULT_MEILI_URL = "http://127.0.0.1:7700"
INDEX_NAME = "artikelen"
class MeiliSearch:
    """Small HTTP client for the Meilisearch instance used by WetGit.

    Every method issues plain ``httpx`` requests against ``self.url`` and
    targets the single index named by ``INDEX_NAME``. One document is stored
    per article ("Artikel") of a regulation.
    """

    def __init__(self, url: str = DEFAULT_MEILI_URL, api_key: str | None = None) -> None:
        """Remember the base URL and build the request headers.

        Args:
            url: Base URL of the Meilisearch server; a trailing slash is dropped.
            api_key: Optional key, sent as a ``Bearer`` token when given.
        """
        self.url = url.rstrip("/")
        self.headers: dict[str, str] = {"Content-Type": "application/json"}
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    @staticmethod
    def _document_id(bwb_id: str, nummer: str) -> str:
        """Build a primary key that Meilisearch accepts.

        Meilisearch only allows ``A-Z a-z 0-9 - _`` in document ids. Article
        numbers such as ``1:1`` or ``3.2`` would make the whole batch fail, so
        every other character is replaced by ``-<hex codepoint>-``. Ids that
        were already valid are returned unchanged.
        """
        raw = f"{bwb_id}_art_{nummer}"
        return re.sub(
            r"[^A-Za-z0-9_-]",
            lambda m: f"-{ord(m.group()):x}-",
            raw,
        )

    @staticmethod
    def _failed(resp: httpx.Response, actie: str) -> bool:
        """Log and report whether ``resp`` carries a 4xx/5xx status."""
        if resp.is_error:
            logger.error(
                "Meilisearch-fout bij %s: HTTP %d %s",
                actie,
                resp.status_code,
                resp.text[:200],
            )
            return True
        return False

    def setup_index(self) -> None:
        """Create the index and apply its search settings.

        Both requests are sent unconditionally; an HTTP error status is logged
        instead of being silently ignored. Transport errors from ``httpx``
        propagate to the caller.
        """
        created = httpx.post(
            f"{self.url}/indexes",
            json={"uid": INDEX_NAME, "primaryKey": "id"},
            headers=self.headers,
            timeout=10,
        )
        self._failed(created, "index aanmaken")

        configured = httpx.patch(
            f"{self.url}/indexes/{INDEX_NAME}/settings",
            json={
                # NOTE(review): "titel" is not a field of the documents built
                # in index_regeling (they carry "regeling_titel") — confirm
                # whether the regulation title was meant to be searchable.
                "searchableAttributes": ["tekst", "titel", "artikel_titel"],
                "filterableAttributes": ["bwb_id", "type", "regeling_titel"],
                "sortableAttributes": ["artikel_nummer"],
                "displayedAttributes": [
                    "id", "bwb_id", "regeling_titel", "type",
                    "artikel_nummer", "artikel_titel", "tekst",
                ],
            },
            headers=self.headers,
            timeout=10,
        )
        if self._failed(configured, "instellingen bijwerken"):
            return
        logger.info("Meilisearch index '%s' geconfigureerd", INDEX_NAME)

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Split a regulation into articles and send them to the index.

        Articles are recognised by ``### Artikel <nummer>`` headings; each
        article body runs until the next article heading, the next ``## ``
        heading, or the end of the text.

        Args:
            bwb_id: Identifier of the regulation, used in every document id.
            titel: Title of the regulation, stored as ``regeling_titel``.
            type_: Regulation type, stored as ``type``.
            tekst: Full Markdown text of the regulation.

        Returns:
            Number of article documents submitted, or 0 when there were none
            or Meilisearch answered with an HTTP error status.
        """
        documents = []
        pattern = r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)"
        for match in re.finditer(pattern, tekst, re.DOTALL):
            nummer = match.group(1)
            body = match.group(2).strip()

            # An article title is the first line wrapped in single asterisks
            # (Markdown italics); bold lines ("**...") are not titles.
            artikel_titel = None
            for line in body.split("\n"):
                line = line.strip()
                if line.startswith("*") and line.endswith("*") and not line.startswith("**"):
                    artikel_titel = line.strip("*").strip()
                    break

            documents.append({
                "id": self._document_id(bwb_id, nummer),
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
                "artikel_nummer": nummer,
                "artikel_titel": artikel_titel,
                "tekst": body,
            })

        if not documents:
            return 0

        resp = httpx.post(
            f"{self.url}/indexes/{INDEX_NAME}/documents",
            json=documents,
            headers=self.headers,
            timeout=30,
        )
        # Do not count documents the server refused to accept.
        if self._failed(resp, f"indexeren van {bwb_id}"):
            return 0
        logger.info("Geïndexeerd: %s: %d artikelen", bwb_id, len(documents))
        return len(documents)

    def search(
        self,
        query: str,
        filter_: str | None = None,
        limit: int = 20,
    ) -> dict:
        """Run a search against the article index.

        Args:
            query: Search terms, sent as ``q``.
            filter_: Optional Meilisearch filter expression, sent verbatim.
            limit: Maximum number of hits requested.

        Returns:
            The decoded JSON body of the response, also when the server
            answered with an error status (which is logged).
        """
        body: dict = {
            "q": query,
            "limit": limit,
            "attributesToHighlight": ["tekst"],
            "highlightPreTag": "**",
            "highlightPostTag": "**",
            "attributesToCrop": ["tekst"],
            "cropLength": 200,
        }
        if filter_:
            body["filter"] = filter_
        resp = httpx.post(
            f"{self.url}/indexes/{INDEX_NAME}/search",
            json=body,
            headers=self.headers,
            timeout=10,
        )
        self._failed(resp, "zoeken")
        return resp.json()

    def health(self) -> bool:
        """Return True when ``/health`` reports status ``available``.

        Any failure (connection problem, bad URL, non-JSON body) yields False
        so callers can fall back to another search path; the cause is logged
        at debug level rather than discarded.
        """
        try:
            resp = httpx.get(f"{self.url}/health", timeout=5)
            return resp.json().get("status") == "available"
        except Exception:
            # Best-effort probe: never raise, but keep the reason traceable.
            logger.debug("Meilisearch health check mislukt", exc_info=True)
            return False
def index_repo(repo_path: Path, meili_url: str = DEFAULT_MEILI_URL) -> int:
    """Index every regulation found in a repo checkout into Meilisearch.

    The list of regulations comes from ``index.json`` in ``repo_path`` (key
    ``regelingen``) when that file exists; otherwise it is produced by
    ``wetgit.pipeline.indexer.generate_index``. For each entry the file
    ``<pad>/README.md`` is read and handed to ``MeiliSearch.index_regeling``;
    entries without that file are skipped.

    Args:
        repo_path: Root directory of the repo.
        meili_url: Base URL of the Meilisearch server.

    Returns:
        Sum of the article counts reported by ``index_regeling``.
    """
    import json

    client = MeiliSearch(url=meili_url)
    client.setup_index()

    indexed = 0

    # Prefer the prebuilt manifest; fall back to generating one on the fly.
    manifest = repo_path / "index.json"
    if manifest.exists():
        payload = json.loads(manifest.read_text(encoding="utf-8"))
        entries = payload.get("regelingen", [])
    else:
        from wetgit.pipeline.indexer import generate_index
        entries = generate_index(repo_path)

    for entry in entries:
        bwb_id = entry["bwb_id"]
        readme = repo_path / entry["pad"] / "README.md"
        if readme.exists():
            markdown = readme.read_text(encoding="utf-8")
            indexed += client.index_regeling(
                bwb_id=bwb_id,
                titel=entry.get("titel", ""),
                type_=entry.get("type", ""),
                tekst=markdown,
            )

    logger.info("Totaal geïndexeerd: %d artikelen", indexed)
    return indexed
if __name__ == "__main__":
    # Command-line entry point: index a repo checkout into Meilisearch.
    import argparse

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="WetGit Meilisearch indexer")
    cli.add_argument("--repo", type=Path, required=True)
    cli.add_argument("--meili-url", default=DEFAULT_MEILI_URL)
    options = cli.parse_args()

    indexed = index_repo(options.repo, options.meili_url)
    print(f"Geïndexeerd: {indexed} artikelen")