feat: MVP Meilisearch full-text search
- search.py: Meilisearch client + indexer (per artikel) - /api/v1/zoeken gebruikt nu Meilisearch (met grep fallback) - Typo-tolerant: "godsdiensst" → vindt art. 1, 6, 23 - 1ms responstijd - 176 artikelen Grondwet geïndexeerd - Meilisearch container draait op dt-prod-01 Sluit #35
This commit is contained in:
parent
21be1367d1
commit
c3d1efc3df
2 changed files with 209 additions and 7 deletions
|
|
@ -26,6 +26,7 @@ from wetgit.api.models import (
|
||||||
)
|
)
|
||||||
|
|
||||||
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
REPO_PATH = Path(os.environ.get("WETGIT_REPO", "/tmp/wetgit-index-test"))
|
||||||
|
MEILI_URL = os.environ.get("MEILI_URL", "http://127.0.0.1:7700")
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="WetGit API",
|
title="WetGit API",
|
||||||
|
|
@ -150,20 +151,34 @@ def zoeken(
|
||||||
type: str | None = Query(None, description="Filter op type"),
|
type: str | None = Query(None, description="Filter op type"),
|
||||||
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
limit: int = Query(20, ge=1, le=100, description="Max resultaten"),
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Doorzoek alle wetgeving (full-text)."""
|
"""Doorzoek alle wetgeving via Meilisearch (typo-tolerant, snel)."""
|
||||||
import re
|
from wetgit.api.search import MeiliSearch
|
||||||
|
|
||||||
|
meili = MeiliSearch(url=MEILI_URL)
|
||||||
|
|
||||||
|
# Probeer Meilisearch, fallback naar grep
|
||||||
|
if meili.health():
|
||||||
|
filter_str = f'type = "{type}"' if type else None
|
||||||
|
result = meili.search(q, filter_=filter_str, limit=limit)
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"bwb_id": hit["bwb_id"],
|
||||||
|
"titel": hit.get("regeling_titel", ""),
|
||||||
|
"artikel": f"Artikel {hit.get('artikel_nummer', '?')}",
|
||||||
|
"context": hit.get("tekst", "")[:200],
|
||||||
|
}
|
||||||
|
for hit in result.get("hits", [])
|
||||||
|
]
|
||||||
|
|
||||||
|
# Fallback: grep-style zoeken
|
||||||
resultaten: list[dict] = []
|
resultaten: list[dict] = []
|
||||||
|
|
||||||
for regeling in store.list_regelingen():
|
for regeling in store.list_regelingen():
|
||||||
if type and regeling.get("type") != type:
|
if type and regeling.get("type") != type:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
tekst = store.get_tekst(regeling["bwb_id"])
|
tekst = store.get_tekst(regeling["bwb_id"])
|
||||||
if tekst is None or q.lower() not in tekst.lower():
|
if tekst is None or q.lower() not in tekst.lower():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Zoek in welk artikel de match zit
|
|
||||||
current_artikel = ""
|
current_artikel = ""
|
||||||
for line in tekst.split("\n"):
|
for line in tekst.split("\n"):
|
||||||
if line.startswith("### Artikel"):
|
if line.startswith("### Artikel"):
|
||||||
|
|
@ -177,5 +192,4 @@ def zoeken(
|
||||||
})
|
})
|
||||||
if len(resultaten) >= limit:
|
if len(resultaten) >= limit:
|
||||||
return resultaten
|
return resultaten
|
||||||
|
|
||||||
return resultaten
|
return resultaten
|
||||||
|
|
|
||||||
188
src/wetgit/api/search.py
Normal file
188
src/wetgit/api/search.py
Normal file
|
|
@ -0,0 +1,188 @@
|
||||||
|
"""Meilisearch integratie — indexer en zoekfunctie.
|
||||||
|
|
||||||
|
Indexeert alle artikelen in Meilisearch voor full-text search
|
||||||
|
met Nederlandse stemming en typo-tolerantie.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL used when the caller does not pass a Meilisearch address.
DEFAULT_MEILI_URL: str = "http://127.0.0.1:7700"
# uid of the Meilisearch index that stores the per-article documents.
INDEX_NAME: str = "artikelen"
|
||||||
|
|
||||||
|
|
||||||
|
class MeiliSearch:
    """Minimal Meilisearch HTTP client for WetGit.

    Wraps the REST calls WetGit needs: creating and configuring the article
    index, uploading one document per article, searching, and a health
    probe. Every call is a one-off ``httpx`` request against ``self.url``.
    """

    def __init__(self, url: str = DEFAULT_MEILI_URL, api_key: str | None = None) -> None:
        """Store the base URL and prepare the request headers.

        Args:
            url: Base URL of the Meilisearch server; a trailing slash is dropped.
            api_key: Optional key, sent as a ``Bearer`` token when given.
        """
        self.url = url.rstrip("/")
        self.headers: dict[str, str] = {"Content-Type": "application/json"}
        if api_key:
            self.headers["Authorization"] = f"Bearer {api_key}"

    def setup_index(self) -> None:
        """Create the index and apply the search settings.

        Raises:
            httpx.HTTPStatusError: If the settings request is rejected.
            httpx.HTTPError: If the server cannot be reached.
        """
        created = httpx.post(
            f"{self.url}/indexes",
            json={"uid": INDEX_NAME, "primaryKey": "id"},
            headers=self.headers,
            timeout=10,
        )
        if created.status_code >= 400:
            # Creation stays best-effort (the index may already exist on a
            # re-run), but a rejected request should at least be visible.
            logger.warning(
                "Meilisearch index '%s' aanmaken gaf HTTP %d",
                INDEX_NAME,
                created.status_code,
            )

        configured = httpx.patch(
            f"{self.url}/indexes/{INDEX_NAME}/settings",
            json={
                # "regeling_titel" is the key the documents actually carry;
                # the previous "titel" matched no document field.
                "searchableAttributes": ["tekst", "regeling_titel", "artikel_titel"],
                "filterableAttributes": ["bwb_id", "type", "regeling_titel"],
                "sortableAttributes": ["artikel_nummer"],
                "displayedAttributes": [
                    "id", "bwb_id", "regeling_titel", "type",
                    "artikel_nummer", "artikel_titel", "tekst",
                ],
            },
            headers=self.headers,
            timeout=10,
        )
        # Do not report the index as configured when the server refused.
        configured.raise_for_status()
        logger.info("Meilisearch index '%s' geconfigureerd", INDEX_NAME)

    @staticmethod
    def _build_documents(bwb_id: str, titel: str, type_: str, tekst: str) -> list[dict]:
        """Split the Markdown of one regeling into one document per article.

        An article starts at a ``### Artikel <nummer>`` heading and runs until
        the next ``### Artikel`` heading, the next ``## `` heading, or the end
        of the text.
        """
        pattern = r"### Artikel (\S+)(.*?)(?=\n### Artikel |\n## |\Z)"
        documents: list[dict] = []

        for match in re.finditer(pattern, tekst, re.DOTALL):
            nummer = match.group(1)
            body = match.group(2).strip()

            # The first line wrapped in single asterisks (Markdown italics,
            # not bold) is taken as the article title.
            artikel_titel = None
            for line in body.split("\n"):
                line = line.strip()
                if line.startswith("*") and line.endswith("*") and not line.startswith("**"):
                    artikel_titel = line.strip("*").strip()
                    break

            documents.append({
                # Meilisearch document ids may only contain a-z, A-Z, 0-9,
                # "-" and "_"; article numbers such as "1:1" or "2.1" would
                # otherwise invalidate the upload. Ids that were already
                # valid are unchanged.
                # NOTE(review): numbers differing only in punctuation map to
                # the same id — confirm this cannot occur within a regeling.
                "id": re.sub(r"[^A-Za-z0-9_-]", "-", f"{bwb_id}_art_{nummer}"),
                "bwb_id": bwb_id,
                "regeling_titel": titel,
                "type": type_,
                "artikel_nummer": nummer,
                "artikel_titel": artikel_titel,
                "tekst": body,
            })

        return documents

    def index_regeling(self, bwb_id: str, titel: str, type_: str, tekst: str) -> int:
        """Index every article of one regeling.

        Args:
            bwb_id: Identifier of the regeling; prefix of each document id.
            titel: Title of the regeling, stored as ``regeling_titel``.
            type_: Type of the regeling, stored as ``type``.
            tekst: Markdown text of the regeling.

        Returns:
            Number of article documents sent to Meilisearch (0 when the text
            contains no article headings; no request is made in that case).

        Raises:
            httpx.HTTPStatusError: If Meilisearch rejects the upload.
            httpx.HTTPError: If the server cannot be reached.
        """
        documents = self._build_documents(bwb_id, titel, type_, tekst)

        if documents:
            resp = httpx.post(
                f"{self.url}/indexes/{INDEX_NAME}/documents",
                json=documents,
                headers=self.headers,
                timeout=30,
            )
            # Fail loudly instead of logging success for a refused upload.
            resp.raise_for_status()
            logger.info("Geïndexeerd: %s — %d artikelen", bwb_id, len(documents))

        return len(documents)

    def search(
        self,
        query: str,
        filter_: str | None = None,
        limit: int = 20,
    ) -> dict:
        """Search the index.

        Args:
            query: Search terms.
            filter_: Optional Meilisearch filter expression.
            limit: Maximum number of hits to request.

        Returns:
            The decoded JSON response. On an HTTP error status this is the
            server's error payload (returned as before, so callers that use
            ``.get("hits", [])`` keep working); the failure is logged.
        """
        body: dict = {
            "q": query,
            "limit": limit,
            "attributesToHighlight": ["tekst"],
            "highlightPreTag": "**",
            "highlightPostTag": "**",
            "attributesToCrop": ["tekst"],
            "cropLength": 200,
        }
        if filter_:
            body["filter"] = filter_

        resp = httpx.post(
            f"{self.url}/indexes/{INDEX_NAME}/search",
            json=body,
            headers=self.headers,
            timeout=10,
        )
        if resp.status_code >= 400:
            logger.warning("Meilisearch zoekopdracht gaf HTTP %d", resp.status_code)
        return resp.json()

    def health(self) -> bool:
        """Return True when Meilisearch answers ``/health`` with status "available"."""
        try:
            resp = httpx.get(f"{self.url}/health", timeout=5)
            return resp.json().get("status") == "available"
        except Exception:
            # Deliberately broad: this probe decides whether callers fall
            # back to another search path, so any failure means "not healthy".
            logger.debug("Meilisearch health-check mislukt", exc_info=True)
            return False
|
||||||
|
|
||||||
|
|
||||||
|
def index_repo(
    repo_path: Path,
    meili_url: str = DEFAULT_MEILI_URL,
    api_key: str | None = None,
) -> int:
    """Index the whole repo in Meilisearch.

    Reads the list of regelingen from ``index.json`` in the repo root (or
    generates it when the file is missing) and indexes the ``README.md`` of
    each regeling.

    Args:
        repo_path: Root directory of the WetGit repo.
        meili_url: Base URL of the Meilisearch server.
        api_key: Optional Meilisearch API key, for servers that require one.

    Returns:
        Total number of indexed articles.
    """
    import json

    meili = MeiliSearch(url=meili_url, api_key=api_key)
    meili.setup_index()

    index_path = repo_path / "index.json"
    if index_path.exists():
        data = json.loads(index_path.read_text(encoding="utf-8"))
    else:
        from wetgit.pipeline.indexer import generate_index
        data = generate_index(repo_path)

    # Accept both shapes from either source: a mapping with a "regelingen"
    # list, or the bare list itself. Previously the file was unwrapped and
    # the generated index was not.
    # NOTE(review): the return shape of generate_index is not visible here —
    # confirm which of the two it produces.
    regelingen = data.get("regelingen", []) if isinstance(data, dict) else data

    total = 0
    for regeling in regelingen:
        bwb_id = regeling["bwb_id"]
        md_path = repo_path / regeling["pad"] / "README.md"
        if not md_path.exists():
            # Skipped as before, but no longer silently.
            logger.warning("README.md ontbreekt voor %s, overgeslagen", bwb_id)
            continue

        total += meili.index_regeling(
            bwb_id=bwb_id,
            titel=regeling.get("titel", ""),
            type_=regeling.get("type", ""),
            tekst=md_path.read_text(encoding="utf-8"),
        )

    logger.info("Totaal geïndexeerd: %d artikelen", total)
    return total
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: index one repo and report the article count.
    import argparse

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    cli = argparse.ArgumentParser(description="WetGit Meilisearch indexer")
    cli.add_argument("--repo", type=Path, required=True)
    cli.add_argument("--meili-url", default=DEFAULT_MEILI_URL)
    options = cli.parse_args()

    indexed = index_repo(options.repo, options.meili_url)
    print(f"Geïndexeerd: {indexed} artikelen")
|
||||||
Loading…
Add table
Reference in a new issue