feat: MVP Meilisearch full-text search
- search.py: Meilisearch client + indexer (per artikel)
- /api/v1/zoeken gebruikt nu Meilisearch (met grep fallback)
- Typo-tolerant: "godsdiensst" → vindt art. 1, 6, 23
- 1ms responstijd
- 176 artikelen Grondwet geïndexeerd
- Meilisearch container draait op dt-prod-01

Sluit #35
This commit is contained in:
parent
21be1367d1
commit
c3d1efc3df
2 changed files with 209 additions and 7 deletions
|
|
@ -26,6 +26,7 @@ from wetgit.api.models import (
|
|||
)
|
||||
|
||||
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
||||
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
|
||||
|
||||
app = FastAPI(
|
||||
title="WetGit API",
|
||||
|
|
@ -150,20 +151,34 @@ def zoeken(
|
|||
type: str | None = Query(None, description="Filter op type"),
|
||||
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
||||
) -> list[dict]:
|
||||
"""Doorzoek alle wetgeving (full-text)."""
|
||||
import re
|
||||
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
|
||||
from wetgit.api.search import MeiliSearch
|
||||
|
||||
meili = MeiliSearch(url=MEILI_URL)
|
||||
|
||||
# Probeer Meilisearch, fallback naar grep
|
||||
if meili.health():
|
||||
filter_str = f'type = "{type}"' if type else None
|
||||
result = meili.search(q, filter_=filter_str, limit=limit)
|
||||
|
||||
return [
|
||||
{
|
||||
"bwb_id": hit["bwb_id"],
|
||||
"titel": hit.get("regeling_titel", ""),
|
||||
"artikel": f"Artikel {hit.get('artikel_nummer', '?')}",
|
||||
"context": hit.get("tekst", "")[:200],
|
||||
}
|
||||
for hit in result.get("hits", [])
|
||||
]
|
||||
|
||||
# Fallback: grep-style zoeken
|
||||
resultaten: list[dict] = []
|
||||
|
||||
for regeling in store.list_regelingen():
|
||||
if type and regeling.get("type") != type:
|
||||
continue
|
||||
|
||||
tekst = store.get_tekst(regeling["bwb_id"])
|
||||
if tekst is None or q.lower() not in tekst.lower():
|
||||
continue
|
||||
|
||||
# Zoek in welk artikel de match zit
|
||||
current_artikel = ""
|
||||
for line in tekst.split("\n"):
|
||||
if line.startswith("### Artikel"):
|
||||
|
|
@ -177,5 +192,4 @@ def zoeken(
|
|||
})
|
||||
if len(resultaten) >= limit:
|
||||
return resultaten
|
||||
|
||||
return resultaten
|
||||
|
|
|
|||
188
src/wetgit/api/search.py
Normal file
188
src/wetgit/api/search.py
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
"""Meilisearch integratie — indexer en zoekfunctie.
|
||||
|
||||
Indexeert alle artikelen in Meilisearch voor full-text search
|
||||
met Nederlandse stemming en typo-tolerantie.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_MEILI_URL = "http://127.0.0.1:7700"
|
||||
INDEX_NAME = "artikelen"
|
||||
|
||||
|
||||
class MeiliSearch:
|
||||
"""Meilisearch client voor WetGit."""
|
||||
|
||||
def __init__(self, url: str = DEFAULT_MEILI_URL, api_key: str | None = None) -> None:
|
||||
self.url = url.rstrip("/")
|
||||
self.headers: dict[str, str] = {"Content-Type": "application/json"}
|
||||
if api_key:
|
||||
self.headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
def setup_index(self) -> None:
|
||||
"""Maak de index aan met de juiste instellingen."""
|
||||
# Maak index
|
||||
httpx.post(
|
||||
f"{self.url}/indexes",
|
||||
json={"uid": INDEX_NAME, "primaryKey": "id"},
|
||||
headers=self.headers,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
# Configureer zoek-instellingen
|
||||
httpx.patch(
|
||||
f"{self.url}/indexes/{INDEX_NAME}/settings",
|
||||
json={
|
||||
"searchableAttributes": ["tekst", "titel", "artikel_titel"],
|
||||
"filterableAttributes": ["bwb_id", "type", "regeling_titel"],
|
||||
"sortableAttributes": ["artikel_nummer"],
|
||||
"displayedAttributes": [
|
||||
"id", "bwb_id", "regeling_titel", "type",
|
||||
"artikel_nummer", "artikel_titel", "tekst",
|
||||
],
|
||||
},
|
||||
headers=self.headers,
|
||||
timeout=10,
|
||||
)
|
||||
logger.info("Meilisearch index '%s' geconfigureerd", INDEX_NAME)
|
||||
|
||||
def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
|
||||
"""Indexeer alle artikelen van een regeling.
|
||||
|
||||
Returns:
|
||||
Aantal geïndexeerde artikelen.
|
||||
"""
|
||||
documents = []
|
||||
pattern = r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)"
|
||||
|
||||
for match in re.finditer(pattern, tekst, re.DOTALL):
|
||||
nummer = match.group(1)
|
||||
body = match.group(2).strip()
|
||||
|
||||
# Extraheer eventuele artikel-titel
|
||||
artikel_titel = None
|
||||
lines = body.split("\n")
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line.startswith("*") and line.endswith("*") and not line.startswith("**"):
|
||||
artikel_titel = line.strip("*").strip()
|
||||
break
|
||||
|
||||
documents.append({
|
||||
"id": f"{bwb_id}_art_{nummer}",
|
||||
"bwb_id": bwb_id,
|
||||
"regeling_titel": titel,
|
||||
"type": type_,
|
||||
"artikel_nummer": nummer,
|
||||
"artikel_titel": artikel_titel,
|
||||
"tekst": body,
|
||||
})
|
||||
|
||||
if documents:
|
||||
resp = httpx.post(
|
||||
f"{self.url}/indexes/{INDEX_NAME}/documents",
|
||||
json=documents,
|
||||
headers=self.headers,
|
||||
timeout=30,
|
||||
)
|
||||
logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(documents))
|
||||
|
||||
return len(documents)
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: str,
|
||||
filter_: str | None = None,
|
||||
limit: int = 20,
|
||||
) -> dict:
|
||||
"""Zoek in de index."""
|
||||
body: dict = {
|
||||
"q": query,
|
||||
"limit": limit,
|
||||
"attributesToHighlight": ["tekst"],
|
||||
"highlightPreTag": "**",
|
||||
"highlightPostTag": "**",
|
||||
"attributesToCrop": ["tekst"],
|
||||
"cropLength": 200,
|
||||
}
|
||||
if filter_:
|
||||
body["filter"] = filter_
|
||||
|
||||
resp = httpx.post(
|
||||
f"{self.url}/indexes/{INDEX_NAME}/search",
|
||||
json=body,
|
||||
headers=self.headers,
|
||||
timeout=10,
|
||||
)
|
||||
return resp.json()
|
||||
|
||||
def health(self) -> bool:
|
||||
"""Check of Meilisearch bereikbaar is."""
|
||||
try:
|
||||
resp = httpx.get(f"{self.url}/health", timeout=5)
|
||||
return resp.json().get("status") == "available"
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def index_repo(repo_path: Path, meili_url: str = DEFAULT_MEILI_URL) -> int:
    """Index every regulation found in a repository checkout.

    The list of regulations is read from ``index.json`` in the repository
    root when that file exists; otherwise it is produced by
    ``generate_index``. Entries without a ``README.md`` are skipped.

    Args:
        repo_path: Root directory of the repository.
        meili_url: Base URL of the Meilisearch server.

    Returns:
        Sum of the counts returned by ``index_regeling`` for all entries.
    """
    import json

    client = MeiliSearch(url=meili_url)
    client.setup_index()

    manifest = repo_path / "index.json"
    if not manifest.exists():
        # No pre-built manifest on disk: derive the list from the repository.
        from wetgit.pipeline.indexer import generate_index
        entries = generate_index(repo_path)
    else:
        manifest_data = json.loads(manifest.read_text(encoding="utf-8"))
        entries = manifest_data.get("regelingen", [])

    indexed = 0
    for entry in entries:
        bwb_id = entry["bwb_id"]
        readme = repo_path / entry["pad"] / "README.md"
        if readme.exists():
            markdown = readme.read_text(encoding="utf-8")
            indexed += client.index_regeling(
                bwb_id=bwb_id,
                titel=entry.get("titel", ""),
                type_=entry.get("type", ""),
                tekst=markdown,
            )

    logger.info("Totaal geïndexeerd: %d artikelen", indexed)
    return indexed
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: index one repository checkout and report the
    # number of articles that were indexed.
    import argparse

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="WetGit Meilisearch indexer")
    cli.add_argument("--repo", required=True, type=Path)
    cli.add_argument("--meili-url", default=DEFAULT_MEILI_URL)
    options = cli.parse_args()

    print(f"Geïndexeerd: {index_repo(options.repo, options.meili_url)} artikelen")
|
||||
Loading…
Add table
Reference in a new issue