From 40c36d612a632e0da176e3accf3612d6239f2d09 Mon Sep 17 00:00:00 2001 From: Coornhert Date: Mon, 30 Mar 2026 07:37:47 +0200 Subject: [PATCH] feat: historie-reconstructie pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - history.py: reconstrueert volledige versiehistorie per regeling - sru_client.py: fetch_all_toestanden() voor alle versies per BWB-ID - Git commits met correcte author date (inwerkingtredings-datum) - flake.nix: venv vervangen door PYTHONPATH + pip --user Pilot: Grondwet (BWBR0001840) — 11 toestanden, 11 commits, 0 failures. git diff toont exacte wetswijzigingen (bijv. art. 131 Grondwet). Sluit #28, #29, #30, #31 --- flake.nix | 13 +- src/wetgit/pipeline/history.py | 220 ++++++++++++++++++++++++++++++ src/wetgit/pipeline/sru_client.py | 27 ++++ 3 files changed, 254 insertions(+), 6 deletions(-) create mode 100644 src/wetgit/pipeline/history.py diff --git a/flake.nix b/flake.nix index d2c683e..dfa32d4 100644 --- a/flake.nix +++ b/flake.nix @@ -103,13 +103,14 @@ echo "" fi - # Venv voor PyPI-only packages (agentmail etc.) - if [ ! -d .venv ]; then - uv venv .venv --python python3.13 --seed - uv pip install --python .venv/bin/python agentmail - echo "Created .venv and installed PyPI dependencies" + # PYTHONPATH voor lokale wetgit package + export PYTHONPATH="$PWD/src:$PYTHONPATH" + + # PyPI-only packages (niet in nixpkgs) installeren in user site + if ! python -c "import agentmail" 2>/dev/null; then + pip install --user --quiet agentmail + echo "Installed agentmail via pip --user" fi - source .venv/bin/activate ''; }; }); diff --git a/src/wetgit/pipeline/history.py b/src/wetgit/pipeline/history.py new file mode 100644 index 0000000..7a3a7f9 --- /dev/null +++ b/src/wetgit/pipeline/history.py @@ -0,0 +1,220 @@ +"""Historie-reconstructie — alle toestanden als git commits met correcte datums. 
# Module documentation, continued (the docstring's opening line sits on the
# preceding physical line of this whitespace-collapsed patch):
#
# Takes a BWB-ID, fetches all historical states ("toestanden") via SRU,
# downloads the XML, parses it to Markdown, and creates one git commit per
# state with the entry-into-force date as the author date.
#
# Usage:
#     python -m wetgit.pipeline.history --bwb-id BWBR0001840 --repo /path/to/rijk --xml-cache /path/to/cache

from __future__ import annotations

import logging
import os
import re
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx

from wetgit.pipeline.bwb_parser import parse_bwb_xml
from wetgit.pipeline.sru_client import SRURecord, fetch_all_toestanden

logger = logging.getLogger(__name__)

# Maps the parsed regulation kind (``result.soort``) to its top-level
# directory in the target repository; unknown kinds fall back to "overig".
TYPE_TO_DIR = {
    "wet": "wet",
    "AMvB": "amvb",
    "ministeriele-regeling": "ministeriele-regeling",
    "KB": "kb",
    "rijkswet": "rijkswet",
    "verdrag": "verdrag",
    "beleidsregel": "beleidsregel",
    "circulaire": "circulaire",
    "zbo": "zbo",
    "pbo": "pbo",
}

# Placeholder used in cache names and commit messages when a state has no
# ``datum_geldig_van``.
_UNKNOWN_DATE = "onbekend"

_GIT_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

# Compiled once; used by _slugify().
_SLUG_STRIP_RE = re.compile(r"[^\w\s-]")
_SLUG_SPACE_RE = re.compile(r"[\s_]+")
_SLUG_DASH_RE = re.compile(r"-+")


def reconstruct_history(
    bwb_id: str,
    repo_path: Path,
    xml_cache: Path,
    delay: float = 0.3,
    dry_run: bool = False,
) -> dict[str, int]:
    """Reconstruct the full version history of one regulation as git commits.

    All states are fetched with ``fetch_all_toestanden``; their XML is
    downloaded into ``xml_cache`` (existing cache files are reused), parsed
    with ``parse_bwb_xml`` and written to
    ``<repo>/<type>/<slug>/<bwb_id>/README.md``. Every state whose Markdown
    differs from the file on disk becomes one commit whose author and
    committer date are derived from the state's ``datum_geldig_van``.

    Args:
        bwb_id: BWB identifier (e.g. BWBR0001840).
        repo_path: Path to the local git working tree that receives commits.
        xml_cache: Directory for cached XML; created if missing.
        delay: Seconds to sleep after each successful download.
        dry_run: If True, skip ``git add``/``git commit``. The Markdown file
            is still written so that change detection between consecutive
            states keeps working.

    Returns:
        Dict with the counters ``toestanden``, ``commits``, ``skipped`` and
        ``failed``. A state whose download failed is counted once, under
        ``failed``.
    """
    xml_cache.mkdir(parents=True, exist_ok=True)
    stats = {"toestanden": 0, "commits": 0, "skipped": 0, "failed": 0}

    # Step 1: fetch all states.
    logger.info("Ophalen toestanden voor %s...", bwb_id)
    toestanden = fetch_all_toestanden(bwb_id)
    stats["toestanden"] = len(toestanden)
    logger.info("Gevonden: %d toestanden", len(toestanden))

    if not toestanden:
        logger.warning("Geen toestanden gevonden voor %s", bwb_id)
        return stats

    total = len(toestanden)

    # Step 2: download every XML that is not cached yet.
    failed_downloads: set[int] = set()
    for i, toestand in enumerate(toestanden):
        datum = toestand.datum_geldig_van or _UNKNOWN_DATE
        xml_path = _cache_path(xml_cache, bwb_id, datum)
        if xml_path.exists():
            continue

        logger.info(" [%d/%d] Downloaden %s...", i + 1, total, datum)
        if _download_xml(toestand, xml_path, datum):
            if delay > 0:
                time.sleep(delay)
        else:
            stats["failed"] += 1
            failed_downloads.add(i)

    # Step 3: parse and commit, in the order returned by
    # fetch_all_toestanden (sorted on datum_geldig_van, oldest first).
    for i, toestand in enumerate(toestanden):
        if i in failed_downloads:
            # Already counted as failed above; do not count it as skipped too.
            continue

        datum = toestand.datum_geldig_van or _UNKNOWN_DATE
        xml_path = _cache_path(xml_cache, bwb_id, datum)
        if not xml_path.exists():
            stats["skipped"] += 1
            continue

        logger.info(" [%d/%d] Parsing %s...", i + 1, total, datum)
        try:
            result = parse_bwb_xml(str(xml_path))
        except Exception as e:  # best effort: one bad state must not stop the run
            logger.warning(" Parse mislukt voor %s: %s", datum, e)
            stats["failed"] += 1
            continue

        # Determine the output path. An empty title, or a title that
        # slugifies to "", falls back to the lower-cased BWB-ID so the path
        # never loses a segment.
        type_dir = TYPE_TO_DIR.get(result.soort, "overig")
        slug = (_slugify(result.titel) if result.titel else "") or bwb_id.lower()
        md_path = repo_path / type_dir / slug / bwb_id / "README.md"
        md_path.parent.mkdir(parents=True, exist_ok=True)

        # Nothing to do when this state renders to what is already on disk.
        if md_path.exists() and md_path.read_text(encoding="utf-8") == result.markdown:
            logger.debug(" Geen wijziging voor %s, overslaan", datum)
            stats["skipped"] += 1
            continue

        md_path.write_text(result.markdown, encoding="utf-8")

        if dry_run:
            logger.info(" [DRY-RUN] Zou committen: %s | %s", datum, bwb_id)
            stats["commits"] += 1
            continue

        # A missing title is left out instead of being rendered as "None".
        commit_msg = " | ".join(
            part for part in (datum, bwb_id, result.titel) if part
        )
        rel_path = md_path.relative_to(repo_path)

        try:
            _git_commit(repo_path, rel_path, commit_msg, _to_git_date(datum))
        except subprocess.CalledProcessError as e:
            stdout = _decode_output(e.stdout)
            stderr = _decode_output(e.stderr)
            # git reports "nothing to commit" on stdout, so inspect both streams.
            if "nothing to commit" in stdout or "nothing to commit" in stderr:
                stats["skipped"] += 1
            else:
                logger.warning(" Git error: %s", (stderr or stdout)[:200])
                stats["failed"] += 1
        else:
            stats["commits"] += 1
            logger.info(" Commit: %s", commit_msg)

    logger.info(
        "Historie %s: %d toestanden, %d commits, %d skipped, %d failed",
        bwb_id, stats["toestanden"], stats["commits"],
        stats["skipped"], stats["failed"],
    )
    return stats


def _cache_path(xml_cache: Path, bwb_id: str, datum: str) -> Path:
    """Return the cache file path for one state of a regulation."""
    # NOTE(review): states sharing the same date (or all lacking one) map to
    # the same cache file — confirm this cannot happen for real SRU data.
    return xml_cache / f"{bwb_id}_{datum}.xml"


def _download_xml(toestand: SRURecord, xml_path: Path, datum: str) -> bool:
    """Download one state's XML to ``xml_path``; return False on HTTP errors."""
    try:
        resp = httpx.get(toestand.xml_url, timeout=60, follow_redirects=True)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(" Download mislukt voor %s: %s", datum, e)
        return False
    xml_path.write_bytes(resp.content)
    return True


def _git_commit(
    repo_path: Path, rel_path: Path, message: str, author_date: str,
) -> None:
    """Stage ``rel_path`` and commit it with the given author/committer date.

    Raises:
        subprocess.CalledProcessError: If ``git add`` or ``git commit`` exits
            with a non-zero status.
    """
    subprocess.run(
        ["git", "add", str(rel_path)],
        cwd=repo_path, check=True, capture_output=True,
    )
    env = {
        **os.environ,
        "GIT_AUTHOR_DATE": author_date,
        "GIT_COMMITTER_DATE": author_date,
    }
    # NOTE(review): git expects --author as "Name <email>"; the address part
    # appears to be missing from this value — confirm against the real source.
    subprocess.run(
        ["git", "commit", "-m", message,
         "--author", "Coornhert "],
        cwd=repo_path, check=True, capture_output=True,
        env=env,
    )


def _decode_output(raw: bytes | None) -> str:
    """Decode captured subprocess output, tolerating undecodable bytes."""
    return raw.decode(errors="replace") if raw else ""


def _to_git_date(date_str: str) -> str:
    """Convert ``YYYY-MM-DD`` to a timestamp string for git's date variables.

    The result is noon UTC on that day, formatted as
    ``YYYY-MM-DDTHH:MM:SS+0000``. If ``date_str`` does not parse (for example
    the "onbekend" placeholder), the current UTC time is used instead and a
    warning is logged.
    """
    try:
        dt = datetime.strptime(date_str, "%Y-%m-%d").replace(
            hour=12, tzinfo=timezone.utc,
        )
    except ValueError:
        logger.warning(
            "Ongeldige datum %r; huidige tijd gebruikt als commit-datum",
            date_str,
        )
        dt = datetime.now(timezone.utc)
    return dt.strftime(_GIT_DATE_FORMAT)


def _slugify(text: str) -> str:
    """Turn a title into a lower-case, hyphen-separated slug.

    Characters other than word characters, whitespace and hyphens are
    removed; runs of whitespace/underscores become a single hyphen; the
    result is cut to 80 characters and stripped of leading/trailing hyphens.
    ``\\w`` is Unicode-aware, so accented letters are kept as-is.
    """
    slug = text.lower().strip()
    slug = _SLUG_STRIP_RE.sub("", slug)
    slug = _SLUG_SPACE_RE.sub("-", slug)
    slug = _SLUG_DASH_RE.sub("-", slug)
    return slug[:80].strip("-")
if __name__ == "__main__":
    # Command-line entry point of src/wetgit/pipeline/history.py.
    import argparse
    import json

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    parser = argparse.ArgumentParser(description="WetGit historie-reconstructie")
    parser.add_argument("--bwb-id", required=True, help="BWB identificatienummer")
    parser.add_argument("--repo", type=Path, required=True, help="Pad naar wetgit/rijk clone")
    parser.add_argument("--xml-cache", type=Path, required=True, help="XML cache directory")
    parser.add_argument("--delay", type=float, default=0.3, help="Delay tussen downloads")
    parser.add_argument("--dry-run", action="store_true", help="Geen git commits")
    args = parser.parse_args()

    stats = reconstruct_history(
        bwb_id=args.bwb_id,
        repo_path=args.repo,
        xml_cache=args.xml_cache,
        delay=args.delay,
        dry_run=args.dry_run,
    )
    # Print the counters as JSON on stdout.
    print(json.dumps(stats, indent=2))


# ---------------------------------------------------------------------------
# From here on the patch touches a different file:
#   diff --git a/src/wetgit/pipeline/sru_client.py b/src/wetgit/pipeline/sru_client.py
#   index 08e53c4..6a30942 100644
#   @@ -104,6 +104,33 @@ def fetch_latest_toestand(bwb_id: str) -> SRURecord | None:
# Hunk context before the addition: the tail of fetch_latest_toestand()
# (`return records[-1]`). Hunk context after it: the signature of
# _fetch_page(query, start_record, maximum_records)
#     -> tuple[list[SRURecord], int], whose body lies outside this patch.
# ---------------------------------------------------------------------------


def fetch_all_toestanden(
    bwb_id: str,
    *,
    page_size: int = 100,
    delay: float = 0.3,
) -> list[SRURecord]:
    """Fetch every state (version) registered for one BWB-ID.

    Pages through the results of the query ``dcterms.identifier=<bwb_id>``
    with ``_fetch_page``, starting at record 1, until the next start record
    exceeds the reported total or a page comes back empty.

    Args:
        bwb_id: The BWB identifier (e.g. BWBR0001840).
        page_size: Maximum number of records requested per page.
        delay: Seconds to sleep between two page requests; values <= 0
            disable the pause.

    Returns:
        All records, sorted ascending on ``datum_geldig_van``. Records
        without that date sort first (they are keyed as "0000-00-00").
    """
    all_records: list[SRURecord] = []
    start = 1

    while True:
        records, total = _fetch_page(
            f"dcterms.identifier={bwb_id}", start, page_size,
        )
        all_records.extend(records)
        start += len(records)
        # Stop on the last page; the empty-page check also prevents an
        # endless loop if the reported total is never reached.
        if start > total or not records:
            break
        if delay > 0:
            time.sleep(delay)

    # Sort on the start of validity, oldest first.
    all_records.sort(key=lambda r: r.datum_geldig_van or "0000-00-00")
    return all_records