diff --git a/src/wetgit/api/app.py b/src/wetgit/api/app.py index b78aa4d..73e1810 100644 --- a/src/wetgit/api/app.py +++ b/src/wetgit/api/app.py @@ -26,6 +26,7 @@ from wetgit.api.models import ( ) REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test")) +MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700") app = FastAPI( title="WetGit API", @@ -150,20 +151,34 @@ def zoeken( type: str | None = Query(None, description="Filter op type"), limit: int = Query(20, ge=1, le=100, description="Max resultaten"), ) -> list[dict]: - """Doorzoek alle wetgeving (full-text).""" - import re + """Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel).""" + from wetgit.api.search import MeiliSearch + meili = MeiliSearch(url=MEILI_URL) + + # Probeer Meilisearch, fallback naar grep + if meili.health(): + filter_str = f'type = "{type}"' if type else None + result = meili.search(q, filter_=filter_str, limit=limit) + + return [ + { + "bwb_id": hit["bwb_id"], + "titel": hit.get("regeling_titel", ""), + "artikel": f"Artikel {hit.get('artikel_nummer', '?')}", + "context": hit.get("tekst", "")[:200], + } + for hit in result.get("hits", []) + ] + + # Fallback: grep-style zoeken resultaten: list[dict] = [] - for regeling in store.list_regelingen(): if type and regeling.get("type") != type: continue - tekst = store.get_tekst(regeling["bwb_id"]) if tekst is None or q.lower() not in tekst.lower(): continue - - # Zoek in welk artikel de match zit current_artikel = "" for line in tekst.split("\n"): if line.startswith("### Artikel"): @@ -177,5 +192,4 @@ def zoeken( }) if len(resultaten) >= limit: return resultaten - return resultaten diff --git a/src/wetgit/api/search.py b/src/wetgit/api/search.py new file mode 100644 index 0000000..d2c55f8 --- /dev/null +++ b/src/wetgit/api/search.py @@ -0,0 +1,188 @@ +"""Meilisearch integratie — indexer en zoekfunctie. + +Indexeert alle artikelen in Meilisearch voor full-text search +met Nederlandse stemming en typo-tolerantie. 
logger = logging.getLogger(__name__)

DEFAULT_MEILI_URL = "http://127.0.0.1:7700"
INDEX_NAME = "artikelen"


class MeiliSearch:
    """Small Meilisearch client used by WetGit to index and query articles.

    Each method issues a single ``httpx`` request against ``self.url`` using
    ``self.headers``; no connection or session state is kept on the instance.
    """

    # One "### Artikel <nummer>" section: the number is the first run of
    # non-whitespace after the heading, the body runs until the next article
    # heading, the next "## " heading, or the end of the text.
    _ARTIKEL_RE = re.compile(
        r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)", re.DOTALL
    )
    # Anything outside this set is escaped when building a document id.
    # NOTE(review): Meilisearch documents primary-key values as limited to
    # alphanumerics, "-" and "_" — confirm against the deployed version.
    _ID_UNSAFE_RE = re.compile(r"[^A-Za-z0-9_-]")

    def __init__(self, url: str = DEFAULT_MEILI_URL, api_key: str | None = None) -> None:
        """Store the base URL (without trailing slash) and request headers.

        Args:
            url: Base URL of the Meilisearch instance.
            api_key: Optional key; when given it is sent as a Bearer token.
        """
        self.url = url.rstrip("/")
        self.headers: dict[str, str] = {"Content-Type": "application/json"}
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    def setup_index(self) -> None:
        """Create the index and apply its search settings.

        Raises:
            httpx.HTTPStatusError: If either request is answered with a
                4xx/5xx status (for example a missing or wrong API key).
        """
        create = httpx.post(
            f"{self.url}/indexes",
            json={"uid": INDEX_NAME, "primaryKey": "id"},
            headers=self.headers,
            timeout=10,
        )
        # NOTE(review): index creation is presumably enqueued as a task, so an
        # already-existing index should not surface as an HTTP error here —
        # verify before relying on re-runs.
        create.raise_for_status()

        settings = httpx.patch(
            f"{self.url}/indexes/{INDEX_NAME}/settings",
            json={
                # Field names must match the keys of the indexed documents;
                # the regulation title is stored as "regeling_titel".
                "searchableAttributes": ["tekst", "regeling_titel", "artikel_titel"],
                "filterableAttributes": ["bwb_id", "type", "regeling_titel"],
                "sortableAttributes": ["artikel_nummer"],
                "displayedAttributes": [
                    "id", "bwb_id", "regeling_titel", "type",
                    "artikel_nummer", "artikel_titel", "tekst",
                ],
            },
            headers=self.headers,
            timeout=10,
        )
        settings.raise_for_status()
        logger.info("Meilisearch index '%s' geconfigureerd", INDEX_NAME)

    @staticmethod
    def _artikel_titel(body: str) -> str | None:
        """Return the first ``*italic*`` line of *body* without its asterisks.

        Lines starting with ``**`` are ignored. Returns ``None`` when no line
        qualifies.
        """
        for raw in body.split("\n"):
            line = raw.strip()
            if line.startswith("*") and line.endswith("*") and not line.startswith("**"):
                return line.strip("*").strip()
        return None

    @classmethod
    def _build_documents(cls, bwb_id: str, titel: str, type_: str, tekst: str) -> list[dict]:
        """Split *tekst* into one document dict per ``### Artikel`` section.

        The ``id`` of each document is ``<bwb_id>_art_<nummer>`` with every
        character outside ``[A-Za-z0-9_-]`` replaced by ``-<hex codepoint>-``
        (so ``1:1`` becomes ``1-3a-1``); ``artikel_nummer`` keeps the original
        spelling.
        """
        documents = []
        for match in cls._ARTIKEL_RE.finditer(tekst):
            nummer = match.group(1)
            body = match.group(2).strip()
            raw_id = f"{bwb_id}_art_{nummer}"
            documents.append({
                "id": cls._ID_UNSAFE_RE.sub(
                    lambda m: f"-{ord(m.group()):x}-", raw_id
                ),
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
                "artikel_nummer": nummer,
                "artikel_titel": cls._artikel_titel(body),
                "tekst": body,
            })
        return documents

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Send all articles of one regulation to the index.

        Args:
            bwb_id: Identifier of the regulation; copied into every document.
            titel: Regulation title, stored as ``regeling_titel``.
            type_: Regulation type, stored as ``type``.
            tekst: Markdown text containing ``### Artikel <nummer>`` sections.

        Returns:
            Number of article documents submitted. ``0`` when the text holds
            no articles (no request is made) or when the server answered the
            submission with an error status.
        """
        documents = self._build_documents(bwb_id, titel, type_, tekst)
        if not documents:
            return 0

        resp = httpx.post(
            f"{self.url}/indexes/{INDEX_NAME}/documents",
            json=documents,
            headers=self.headers,
            timeout=30,
        )
        if resp.is_error:
            # Keep batch runs going, but do not report documents the server refused.
            logger.error("Indexeren mislukt: %s — HTTP %d", bwb_id, resp.status_code)
            return 0

        logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(documents))
        return len(documents)

    def search(
        self,
        query: str,
        filter_: str | None = None,
        limit: int = 20,
    ) -> dict:
        """Run a search request against the index.

        Args:
            query: Search terms, sent as ``q``.
            filter_: Optional filter expression, sent verbatim as ``filter``.
            limit: Maximum number of hits requested.

        Returns:
            The decoded JSON response body, unmodified. The HTTP status is
            not checked, so an error response is returned as whatever JSON
            the server sent.
        """
        body: dict = {
            "q": query,
            "limit": limit,
            "attributesToHighlight": ["tekst"],
            "highlightPreTag": "**",
            "highlightPostTag": "**",
            "attributesToCrop": ["tekst"],
            "cropLength": 200,
        }
        if filter_:
            body["filter"] = filter_

        resp = httpx.post(
            f"{self.url}/indexes/{INDEX_NAME}/search",
            json=body,
            headers=self.headers,
            timeout=10,
        )
        return resp.json()

    def health(self) -> bool:
        """Return ``True`` only when ``GET /health`` reports status "available".

        Any failure (connection error, non-JSON body, unexpected shape) yields
        ``False`` instead of an exception, so callers can use this as a cheap
        guard before falling back to another search path.
        """
        try:
            resp = httpx.get(f"{self.url}/health", timeout=5)
            return resp.json().get("status") == "available"
        except Exception:
            # Deliberately broad: this is a best-effort probe.
            logger.debug("Meilisearch health check mislukt", exc_info=True)
            return False
def index_repo(repo_path: Path, meili_url: str = DEFAULT_MEILI_URL) -> int:
    """Index every regulation of a repository checkout in Meilisearch.

    The list of regulations is read from ``<repo_path>/index.json`` (key
    ``"regelingen"``) when that file exists; otherwise it is produced by
    ``wetgit.pipeline.indexer.generate_index``. For each entry the file
    ``<repo_path>/<pad>/README.md`` is read and handed to
    ``MeiliSearch.index_regeling``.

    Entries lacking ``bwb_id`` or ``pad``, and entries whose README is
    missing, are skipped with a warning instead of aborting the run.

    Args:
        repo_path: Root directory of the repository.
        meili_url: Base URL of the Meilisearch instance.

    Returns:
        Sum of the counts returned by ``index_regeling``.
    """
    import json

    meili = MeiliSearch(url=meili_url)
    meili.setup_index()

    index_path = repo_path / "index.json"
    if index_path.exists():
        data = json.loads(index_path.read_text(encoding="utf-8"))
        regelingen = data.get("regelingen", [])
    else:
        # Imported lazily: only needed when no prebuilt index.json is present.
        from wetgit.pipeline.indexer import generate_index
        regelingen = generate_index(repo_path)

    total = 0
    for regeling in regelingen:
        bwb_id = regeling.get("bwb_id")
        pad = regeling.get("pad")
        if not bwb_id or not pad:
            logger.warning("Regeling zonder bwb_id of pad overgeslagen: %r", regeling)
            continue

        md_path = repo_path / pad / "README.md"
        try:
            tekst = md_path.read_text(encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError):
            logger.warning("Geen README.md voor %s (%s), overgeslagen", bwb_id, md_path)
            continue

        total += meili.index_regeling(
            bwb_id=bwb_id,
            titel=regeling.get("titel", ""),
            type_=regeling.get("type", ""),
            tekst=tekst,
        )

    logger.info("Totaal geïndexeerd: %d artikelen", total)
    return total


def _main() -> None:
    """Command-line entry point: parse ``--repo`` / ``--meili-url`` and index."""
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="WetGit Meilisearch indexer")
    parser.add_argument("--repo", type=Path, required=True)
    parser.add_argument("--meili-url", default=DEFAULT_MEILI_URL)
    args = parser.parse_args()

    total = index_repo(args.repo, args.meili_url)
    print(f"Geïndexeerd: {total} artikelen")


if __name__ == "__main__":
    _main()