feat: BWB pipeline — SRU crawler, downloader, runner
- SRU client: crawlt zoekservice.overheid.nl voor alle BWB regelingen
- Downloader: haalt XML op van repository.officiele-overheidspublicaties.nl
- Pipeline runner: orchestreert crawl → download → parse → schrijf Markdown
- Deduplicatie: meerdere SRU toestanden → meest recente per BWB-ID
- Mappenstructuur conform PRD: wet/{slug}/{BWB_ID}/README.md
- CLI: python -m wetgit.pipeline.runner --output /path --type wet --limit N
Getest: 28 wetten succesvol gedownload en geparsed, 0 failures.
Refs #6
This commit is contained in:
parent
c481ebf9e7
commit
da7d11deb9
3 changed files with 378 additions and 0 deletions
41
src/wetgit/pipeline/downloader.py
Normal file
41
src/wetgit/pipeline/downloader.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""BWB XML downloader.
|
||||
|
||||
Downloadt BWB toestand-XML bestanden van de officiele repository.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from wetgit.pipeline.sru_client import SRURecord
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def download_xml(record: SRURecord, output_dir: Path) -> Path | None:
    """Download the toestand XML for one SRU record.

    Args:
        record: SRU record carrying the download URL (``xml_url``).
        output_dir: Directory to store the XML in.

    Returns:
        Path to the downloaded (or cached) file, or None on failure.
    """
    xml_path = output_dir / f"{record.bwb_id}.xml"
    # Temp name used so the real path only ever holds a complete file.
    tmp_path = xml_path.with_suffix(".xml.part")

    # Cache hit: reuse a previously downloaded file.
    if xml_path.exists():
        return xml_path

    try:
        resp = httpx.get(record.xml_url, timeout=60, follow_redirects=True)
        resp.raise_for_status()
        # Write to a temp file first and rename atomically afterwards, so an
        # interrupted download can never be mistaken for a valid cache file
        # by the exists() check on a subsequent run.
        tmp_path.write_bytes(resp.content)
        tmp_path.replace(xml_path)
        logger.debug("Downloaded %s (%d KB)", record.bwb_id, len(resp.content) // 1024)
        return xml_path
    except (httpx.HTTPError, OSError) as e:
        # OSError covers write failures (full disk, permissions) so they
        # degrade to None just like network failures, instead of crashing
        # the pipeline loop.
        logger.warning("Download failed for %s: %s", record.bwb_id, e)
        return None
|
||||
151
src/wetgit/pipeline/runner.py
Normal file
151
src/wetgit/pipeline/runner.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
"""Pipeline runner — orkestreert SRU crawl, download en parse.
|
||||
|
||||
Usage:
|
||||
python -m wetgit.pipeline.runner --type wet --limit 100 --output /tmp/wetgit-output
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
||||
from wetgit.pipeline.downloader import download_xml
|
||||
from wetgit.pipeline.sru_client import BWB_TYPES, SRURecord, fetch_catalogue
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Directory structure as specified in the PRD.
# Maps a BWB regulation type (as reported by the SRU service) to the output
# subdirectory name; types not listed here fall back to "overig" in
# run_pipeline.
TYPE_TO_DIR = {
    "wet": "wet",
    "AMvB": "amvb",
    "ministeriele-regeling": "ministeriele-regeling",
    "KB": "kb",
    "rijkswet": "rijkswet",
    "verdrag": "verdrag",
    "beleidsregel": "beleidsregel",
    "circulaire": "circulaire",
    "zbo": "zbo",
    "pbo": "pbo",
}
|
||||
|
||||
|
||||
def run_pipeline(
    output_dir: Path,
    xml_cache_dir: Path | None = None,
    types: list[str] | None = None,
    limit: int | None = None,
    delay: float = 0.3,
) -> dict[str, int]:
    """Run the full BWB pipeline: SRU crawl, download, parse, write Markdown.

    Args:
        output_dir: Where the Markdown files are written.
        xml_cache_dir: Cache directory for downloaded XML
            (default: output_dir/.xml-cache).
        types: Regulation types to process (default: all).
        limit: Maximum number of regulations (None = everything).
        delay: Delay between downloads (seconds).

    Returns:
        Dict with statistics (catalogus, downloaded, parsed, failed, skipped).
    """
    xml_cache_dir = xml_cache_dir or output_dir / ".xml-cache"
    xml_cache_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    stats = {"catalogus": 0, "downloaded": 0, "parsed": 0, "failed": 0, "skipped": 0}

    # Step 1: fetch the SRU catalogue.
    logger.info("Stap 1: SRU catalogus ophalen (types=%s, limit=%s)", types, limit)
    records = fetch_catalogue(types=types, max_records=limit)
    stats["catalogus"] = len(records)
    logger.info("Catalogus: %d regelingen gevonden", len(records))

    # Deduplicate on BWB-ID (SRU can return multiple toestanden per
    # regulation). The last record per BWB-ID wins — presumably the most
    # recent toestand, matching the ordering the SRU service returns.
    seen: dict[str, SRURecord] = {}
    for r in records:
        seen[r.bwb_id] = r  # last one wins
    unique_records = list(seen.values())
    logger.info("Na deduplicatie: %d unieke regelingen", len(unique_records))

    # Steps 2+3: download and parse each regulation.
    for i, record in enumerate(unique_records):
        if i > 0 and i % 100 == 0:
            logger.info("Voortgang: %d/%d (%d parsed, %d failed)",
                        i, len(unique_records), stats["parsed"], stats["failed"])

        # Output path: output_dir/{type}/{slug}/{BWB_ID}/README.md
        type_dir = TYPE_TO_DIR.get(record.type, "overig")
        # Fall back to the BWB-ID when the title is missing OR slugifies to
        # an empty string (e.g. a punctuation-only title). An empty slug
        # would collapse the {slug} path level, making unrelated regulations
        # share a directory.
        slug = _slugify(record.titel) if record.titel else ""
        if not slug:
            slug = record.bwb_id.lower()
        regeling_dir = output_dir / type_dir / slug / record.bwb_id
        md_path = regeling_dir / "README.md"

        # Already written on a previous run: skip (idempotent re-runs).
        if md_path.exists():
            stats["skipped"] += 1
            continue

        # Download (served from the XML cache when already present).
        xml_path = download_xml(record, xml_cache_dir)
        if xml_path is None:
            stats["failed"] += 1
            continue
        stats["downloaded"] += 1

        # Parse the XML and write the Markdown output.
        try:
            result = parse_bwb_xml(str(xml_path))
            regeling_dir.mkdir(parents=True, exist_ok=True)
            md_path.write_text(result.markdown, encoding="utf-8")
            stats["parsed"] += 1
        except Exception as e:
            # Broad catch is deliberate: one malformed regulation must not
            # abort the whole pipeline run.
            logger.warning("Parse failed for %s: %s", record.bwb_id, e)
            stats["failed"] += 1

        if delay > 0:
            time.sleep(delay)

    logger.info(
        "Pipeline klaar: %d parsed, %d failed, %d skipped van %d",
        stats["parsed"], stats["failed"], stats["skipped"], stats["catalogus"],
    )
    return stats
|
||||
|
||||
|
||||
def _slugify(text: str) -> str:
|
||||
"""Maak een URL-veilige slug van een titel."""
|
||||
import re
|
||||
slug = text.lower().strip()
|
||||
slug = re.sub(r"[^\w\s-]", "", slug)
|
||||
slug = re.sub(r"[\s_]+", "-", slug)
|
||||
slug = re.sub(r"-+", "-", slug)
|
||||
return slug[:80].strip("-")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Console logging with compact timestamps for interactive runs.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    cli = argparse.ArgumentParser(description="WetGit BWB pipeline")
    cli.add_argument("--output", type=Path, required=True, help="Output directory")
    cli.add_argument("--xml-cache", type=Path, help="XML cache directory")
    cli.add_argument("--type", action="append", dest="types", help="Regelingtype (herhaalbaar)")
    cli.add_argument("--limit", type=int, help="Maximum aantal regelingen")
    cli.add_argument("--delay", type=float, default=0.3, help="Delay tussen downloads (sec)")
    opts = cli.parse_args()

    # Run the pipeline and report the final statistics as JSON on stdout.
    summary = run_pipeline(
        output_dir=opts.output,
        xml_cache_dir=opts.xml_cache,
        types=opts.types,
        limit=opts.limit,
        delay=opts.delay,
    )
    print(json.dumps(summary, indent=2))
|
||||
186
src/wetgit/pipeline/sru_client.py
Normal file
186
src/wetgit/pipeline/sru_client.py
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
"""SRU client voor het Basiswettenbestand.
|
||||
|
||||
Crawlt de SRU-zoekservice van overheid.nl om BWB regelingen te vinden
|
||||
en download-URLs voor de XML-toestanden op te halen.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
from lxml import etree
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# searchRetrieve endpoint of the overheid.nl SRU search service.
SRU_BASE = "https://zoekservice.overheid.nl/sru/Search"
# XML namespace prefixes used for XPath lookups in the SRU response tree.
SRU_NS = {
    "srw": "http://www.loc.gov/zing/srw/",
    "bwb": "http://standaarden.overheid.nl/bwb/terms/",
    "dcterms": "http://purl.org/dc/terms/",
    "overheid": "http://standaarden.overheid.nl/owms/terms/",
}

# All regulation types in the BWB (Basiswettenbestand).
BWB_TYPES = [
    "wet", "AMvB", "ministeriele-regeling", "KB", "rijkswet",
    "verdrag", "beleidsregel", "circulaire", "zbo", "pbo",
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SRURecord:
    """One regulation (toestand) from the SRU response."""

    bwb_id: str  # BWB identifier, e.g. "BWBR0001840"; always non-empty
    titel: str  # official title; may be empty when the record has none
    type: str  # regulation type (one of BWB_TYPES); may be empty
    xml_url: str  # download URL of the toestand XML; always non-empty
    wti_url: str  # URL taken from the "locatie_wti" field; may be empty
    datum_geldig_van: str | None = None  # validity start date, as given by SRU
    datum_geldig_tot: str | None = None  # validity end date, as given by SRU
    ministerie: str | None = None  # first authority/creator found, if any
|
||||
|
||||
|
||||
def fetch_catalogue(
    types: list[str] | None = None,
    max_records: int | None = None,
    batch_size: int = 100,
    delay: float = 0.5,
) -> list[SRURecord]:
    """Fetch the full BWB catalogue via SRU.

    Args:
        types: List of regulation types to fetch (default: all).
        max_records: Maximum number of records (None = everything).
        batch_size: Records per SRU request (max 100).
        delay: Delay between requests (seconds), to avoid overloading the
            server.

    Returns:
        List of SRURecords with metadata and download URLs.
    """
    types = types or BWB_TYPES
    all_records: list[SRURecord] = []

    for type_name in types:
        query = f"dcterms.type=={type_name}"
        start = 1

        while True:
            records, total = _fetch_page(query, start, batch_size)
            all_records.extend(records)
            logger.info(
                "SRU %s: %d-%d / %d (totaal: %d)",
                type_name, start, start + len(records) - 1, total, len(all_records),
            )

            if max_records and len(all_records) >= max_records:
                return all_records[:max_records]

            # Advance by the *requested* page size, NOT by len(records):
            # _fetch_page silently drops records it cannot parse, so the
            # parsed count can be smaller than the number of records the
            # server actually returned. Advancing by the parsed count would
            # re-fetch (and duplicate) the tail of each such page, and a
            # page on which every record failed to parse would end the
            # crawl for this type prematurely. Since `start` grows strictly,
            # the `start > total` check guarantees termination.
            start += batch_size
            if start > total:
                break

            time.sleep(delay)

    return all_records
|
||||
|
||||
|
||||
def fetch_latest_toestand(bwb_id: str) -> SRURecord | None:
    """Fetch the most recent toestand for a single BWB-ID.

    Args:
        bwb_id: The BWB identifier (e.g. BWBR0001840).

    Returns:
        The SRURecord, or None when nothing was found.
    """
    matches, _ = _fetch_page(f"dcterms.identifier={bwb_id}", 1, 100)
    # The final record in the response is taken to be the most recent
    # toestand — presumably the SRU service orders toestanden oldest-first;
    # verify against the service if this ever looks wrong.
    return matches[-1] if matches else None
|
||||
|
||||
|
||||
def _fetch_page(
    query: str, start_record: int, maximum_records: int,
) -> tuple[list[SRURecord], int]:
    """Fetch a single page of SRU results.

    Returns the parseable records on the page plus the server-reported
    total number of matches for the query.
    """
    params = {
        "operation": "searchRetrieve",
        "version": "1.2",
        "x-connection": "BWB",
        "query": query,
        "startRecord": str(start_record),
        "maximumRecords": str(maximum_records),
    }

    resp = httpx.get(SRU_BASE, params=params, timeout=30)
    resp.raise_for_status()
    tree = etree.fromstring(resp.content)

    # Total number of matching records, as reported by the server
    # (0 when the element is missing or empty).
    total_el = tree.find(".//srw:numberOfRecords", SRU_NS)
    total = int(total_el.text) if total_el is not None and total_el.text else 0

    # Parse every record element, dropping the ones that fail to parse
    # (_parse_record returns None for those).
    candidates = (_parse_record(el) for el in tree.findall(".//srw:record", SRU_NS))
    page_records = [rec for rec in candidates if rec]

    return page_records, total
|
||||
|
||||
|
||||
def _parse_record(record: etree._Element) -> SRURecord | None:
    """Parse a single SRU record element into an SRURecord.

    Scans every descendant element (covers both originalData and
    enrichedData) and picks out fields of interest by tag suffix or
    substring.

    Returns:
        An SRURecord, or None when the record lacks a BWB-ID or an XML
        download URL (both are required downstream).
    """
    bwb_id = ""
    titel = ""
    type_ = ""
    xml_url = ""
    wti_url = ""
    datum_van = None
    datum_tot = None
    ministerie = None

    for elem in record.iter():
        tag = elem.tag
        # lxml's iter() also yields comment/processing-instruction nodes,
        # whose .tag is a callable rather than a string — that would crash
        # the endswith checks below, so skip them.
        if not isinstance(tag, str):
            continue
        text = (elem.text or "").strip()

        if not text:
            continue

        if tag.endswith("}identifier") and "BWB" in text:
            bwb_id = text
        elif tag.endswith("}title"):
            titel = text
        elif tag.endswith("}type") and text in BWB_TYPES:
            type_ = text
        elif tag.endswith("}authority") or tag.endswith("}creator"):
            # Keep only the first authority/creator encountered.
            if not ministerie:
                ministerie = text
        elif "locatie_toestand" in tag:
            xml_url = text
        elif "locatie_wti" in tag:
            wti_url = text
        elif "geldigheidsperiode_startdatum" in tag:
            datum_van = text
        elif "geldigheidsperiode_einddatum" in tag:
            datum_tot = text

    # Without an identifier or a download URL the record is unusable.
    if not bwb_id or not xml_url:
        return None

    return SRURecord(
        bwb_id=bwb_id,
        titel=titel,
        type=type_,
        xml_url=xml_url,
        wti_url=wti_url,
        datum_geldig_van=datum_van,
        datum_geldig_tot=datum_tot,
        ministerie=ministerie,
    )
|
||||
Loading…
Add table
Reference in a new issue