feat: historie-reconstructie pipeline

- history.py: reconstrueert volledige versiehistorie per regeling
- sru_client.py: fetch_all_toestanden() voor alle versies per BWB-ID
- Git commits met correcte author date (inwerkingtredings-datum)
- flake.nix: venv vervangen door PYTHONPATH + pip --user

Pilot: Grondwet (BWBR0001840) — 11 toestanden, 11 commits, 0 failures.
git diff toont exacte wetswijzigingen (bijv. art. 131 Grondwet).

Sluit #28, #29, #30, #31
This commit is contained in:
Coornhert 2026-03-30 07:37:47 +02:00
parent 03402cdfa0
commit 40c36d612a
3 changed files with 254 additions and 6 deletions

View file

@ -103,13 +103,14 @@
echo "" echo ""
fi fi
# Venv voor PyPI-only packages (agentmail etc.) # PYTHONPATH voor lokale wetgit package
if [ ! -d .venv ]; then export PYTHONPATH="$PWD/src:$PYTHONPATH"
uv venv .venv --python python3.13 --seed
uv pip install --python .venv/bin/python agentmail # PyPI-only packages (niet in nixpkgs) installeren in user site
echo "Created .venv and installed PyPI dependencies" if ! python -c "import agentmail" 2>/dev/null; then
pip install --user --quiet agentmail
echo "Installed agentmail via pip --user"
fi fi
source .venv/bin/activate
''; '';
}; };
}); });

View file

@ -0,0 +1,220 @@
"""Historie-reconstructie — alle toestanden als git commits met correcte datums.
Neemt een BWB-ID, haalt alle historische toestanden op via SRU,
downloadt de XML, parseert naar Markdown, en maakt per toestand
een git commit met de inwerkingtredings-datum als author date.
Usage:
python -m wetgit.pipeline.history --bwb-id BWBR0001840 --repo /path/to/rijk --xml-cache /path/to/cache
"""
from __future__ import annotations

import logging
import os
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path

import httpx

from wetgit.pipeline.bwb_parser import parse_bwb_xml
from wetgit.pipeline.sru_client import SRURecord, fetch_all_toestanden
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the regulation kind reported by the parser (``result.soort``) to the
# top-level directory name used inside the repository. Kinds that are missing
# from this table are filed under "overig" by reconstruct_history.
TYPE_TO_DIR = {
    "wet": "wet",
    "AMvB": "amvb",
    "ministeriele-regeling": "ministeriele-regeling",
    "KB": "kb",
    "rijkswet": "rijkswet",
    "verdrag": "verdrag",
    "beleidsregel": "beleidsregel",
    "circulaire": "circulaire",
    "zbo": "zbo",
    "pbo": "pbo",
}
def reconstruct_history(
    bwb_id: str,
    repo_path: Path,
    xml_cache: Path,
    delay: float = 0.3,
    dry_run: bool = False,
) -> dict[str, int]:
    """Reconstruct the full version history of one regulation as git commits.

    Fetches every historical state ("toestand") of the regulation, downloads
    each XML into ``xml_cache`` (files already cached are reused), converts
    it to Markdown and commits the result with the state's
    ``datum_geldig_van`` as both author and committer date. States are
    handled in the order returned by ``fetch_all_toestanden``.

    Every state ends up in exactly one of the counters "commits", "skipped"
    or "failed", so those three add up to "toestanden".

    Args:
        bwb_id: BWB identifier (e.g. BWBR0001840).
        repo_path: Path to the local clone of wetgit/rijk.
        xml_cache: Directory used to cache the downloaded XML files.
        delay: Pause in seconds after each download.
        dry_run: If True, log the commits that would be made without
            running git.

    Returns:
        Dict with the counters "toestanden", "commits", "skipped" and
        "failed".
    """
    xml_cache.mkdir(parents=True, exist_ok=True)
    stats = {"toestanden": 0, "commits": 0, "skipped": 0, "failed": 0}

    # Step 1: fetch all states.
    logger.info("Ophalen toestanden voor %s...", bwb_id)
    toestanden = fetch_all_toestanden(bwb_id)
    total = len(toestanden)
    stats["toestanden"] = total
    logger.info("Gevonden: %d toestanden", total)

    if not toestanden:
        logger.warning("Geen toestanden gevonden voor %s", bwb_id)
        return stats

    # Step 2: make sure every XML is in the cache. Only states whose XML is
    # available are handed to step 3, so a failed download is counted once
    # (as "failed") instead of a second time as "skipped".
    available: list[tuple[int, str, Path]] = []
    for number, toestand in enumerate(toestanden, start=1):
        datum = toestand.datum_geldig_van or "onbekend"
        xml_path = xml_cache / f"{bwb_id}_{datum}.xml"

        if not xml_path.exists():
            logger.info(" [%d/%d] Downloaden %s...", number, total, datum)
            try:
                resp = httpx.get(toestand.xml_url, timeout=60, follow_redirects=True)
                resp.raise_for_status()
                xml_path.write_bytes(resp.content)
            except httpx.HTTPError as e:
                logger.warning(" Download mislukt voor %s: %s", datum, e)
                stats["failed"] += 1
                continue
            if delay > 0:
                time.sleep(delay)

        available.append((number, datum, xml_path))

    # Step 3: parse and commit, in the order the states were returned.
    for number, datum, xml_path in available:
        logger.info(" [%d/%d] Parsing %s...", number, total, datum)
        try:
            result = parse_bwb_xml(str(xml_path))
        except Exception as e:
            # Deliberately broad: one unparsable state must not abort the run.
            logger.warning(" Parse mislukt voor %s: %s", datum, e)
            stats["failed"] += 1
            continue

        # Determine the output path.
        type_dir = TYPE_TO_DIR.get(result.soort, "overig")
        slug = _slugify(result.titel) if result.titel else bwb_id.lower()
        md_path = repo_path / type_dir / slug / bwb_id / "README.md"
        md_path.parent.mkdir(parents=True, exist_ok=True)

        # Skip states that do not change the file on disk.
        if md_path.exists() and md_path.read_text(encoding="utf-8") == result.markdown:
            logger.debug(" Geen wijziging voor %s, overslaan", datum)
            stats["skipped"] += 1
            continue

        # NOTE(review): the file is written in dry-run mode as well, so the
        # working tree is modified even though no commit is made. Kept as-is
        # because the next iteration's change detection reads this file.
        md_path.write_text(result.markdown, encoding="utf-8")

        if dry_run:
            logger.info(" [DRY-RUN] Zou committen: %s | %s", datum, bwb_id)
            stats["commits"] += 1
            continue

        # Git commit with the state's date as author and committer date.
        commit_msg = f"{datum} | {bwb_id} | {result.titel}"
        author_date = _to_git_date(datum)
        git_env = {
            **os.environ,
            "GIT_AUTHOR_DATE": author_date,
            "GIT_COMMITTER_DATE": author_date,
        }
        try:
            subprocess.run(
                ["git", "add", str(md_path.relative_to(repo_path))],
                cwd=repo_path, check=True, capture_output=True,
            )
            subprocess.run(
                ["git", "commit", "-m", commit_msg,
                 "--author", "Coornhert <coornhert@wetgit.nl>"],
                cwd=repo_path, check=True, capture_output=True,
                env=git_env,
            )
        except subprocess.CalledProcessError as e:
            stdout = (e.stdout or b"").decode("utf-8", errors="replace")
            stderr = (e.stderr or b"").decode("utf-8", errors="replace")
            # git reports "nothing to commit" as status output on stdout, not
            # on stderr, so both streams have to be inspected.
            if "nothing to commit" in stdout or "nothing to commit" in stderr:
                stats["skipped"] += 1
            else:
                logger.warning(" Git error: %s", (stderr or stdout)[:200])
                stats["failed"] += 1
        else:
            stats["commits"] += 1
            logger.info(" Commit: %s", commit_msg)

    logger.info(
        "Historie %s: %d toestanden, %d commits, %d skipped, %d failed",
        bwb_id, stats["toestanden"], stats["commits"],
        stats["skipped"], stats["failed"],
    )
    return stats
def _to_git_date(date_str: str) -> str:
"""Converteer YYYY-MM-DD naar git-compatible datum string."""
try:
dt = datetime.strptime(date_str, "%Y-%m-%d").replace(
hour=12, tzinfo=timezone.utc,
)
return dt.strftime("%Y-%m-%dT%H:%M:%S%z")
except ValueError:
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S%z")
def _slugify(text: str) -> str:
"""Maak een URL-veilige slug van een titel."""
import re
slug = text.lower().strip()
slug = re.sub(r"[^\w\s-]", "", slug)
slug = re.sub(r"[\s_]+", "-", slug)
slug = re.sub(r"-+", "-", slug)
return slug[:80].strip("-")
if __name__ == "__main__":
    import argparse
    import json

    # Command-line entry point: parse the options, run the reconstruction
    # and print the resulting counters as JSON.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    cli_options = (
        ("--bwb-id", {"required": True, "help": "BWB identificatienummer"}),
        ("--repo", {"type": Path, "required": True, "help": "Pad naar wetgit/rijk clone"}),
        ("--xml-cache", {"type": Path, "required": True, "help": "XML cache directory"}),
        ("--delay", {"type": float, "default": 0.3, "help": "Delay tussen downloads"}),
        ("--dry-run", {"action": "store_true", "help": "Geen git commits"}),
    )
    cli = argparse.ArgumentParser(description="WetGit historie-reconstructie")
    for flag, options in cli_options:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    summary = reconstruct_history(
        opts.bwb_id,
        opts.repo,
        opts.xml_cache,
        delay=opts.delay,
        dry_run=opts.dry_run,
    )
    print(json.dumps(summary, indent=2))

View file

@ -104,6 +104,33 @@ def fetch_latest_toestand(bwb_id: str) -> SRURecord | None:
return records[-1] return records[-1]
def fetch_all_toestanden(bwb_id: str) -> list[SRURecord]:
    """Fetch every state (version) registered for one BWB-ID.

    Pages through the results 100 records at a time, pausing 0.3 seconds
    between pages, until the reported total is reached or a page comes back
    empty.

    Args:
        bwb_id: The BWB identifier (e.g. BWBR0001840).

    Returns:
        List of SRURecords ordered by ``datum_geldig_van``, oldest first;
        records without that date sort to the front.
    """
    query = f"dcterms.identifier={bwb_id}"
    collected: list[SRURecord] = []
    offset = 1

    while True:
        page, total = _fetch_page(query, offset, 100)
        collected.extend(page)
        offset += len(page)
        if not page or offset > total:
            break
        time.sleep(0.3)

    def valid_from(record: SRURecord) -> str:
        # Records without a start date get a key that sorts before any real date.
        return record.datum_geldig_van or "0000-00-00"

    collected.sort(key=valid_from)
    return collected
def _fetch_page( def _fetch_page(
query: str, start_record: int, maximum_records: int, query: str, start_record: int, maximum_records: int,
) -> tuple[list[SRURecord], int]: ) -> tuple[list[SRURecord], int]: