feat: historie-reconstructie pipeline
- history.py: reconstrueert volledige versiehistorie per regeling
- sru_client.py: fetch_all_toestanden() voor alle versies per BWB-ID
- Git commits met correcte author date (inwerkingtredings-datum)
- flake.nix: venv vervangen door PYTHONPATH + pip --user

Pilot: Grondwet (BWBR0001840) — 11 toestanden, 11 commits, 0 failures.
git diff toont exacte wetswijzigingen (bijv. art. 131 Grondwet).

Sluit #28, #29, #30, #31
This commit is contained in:
parent
03402cdfa0
commit
40c36d612a
3 changed files with 254 additions and 6 deletions
13
flake.nix
13
flake.nix
|
|
@ -103,13 +103,14 @@
|
|||
echo ""
|
||||
fi
|
||||
|
||||
# Venv voor PyPI-only packages (agentmail etc.)
|
||||
if [ ! -d .venv ]; then
|
||||
uv venv .venv --python python3.13 --seed
|
||||
uv pip install --python .venv/bin/python agentmail
|
||||
echo "Created .venv and installed PyPI dependencies"
|
||||
# PYTHONPATH voor lokale wetgit package
|
||||
export PYTHONPATH="$PWD/src:$PYTHONPATH"
|
||||
|
||||
# PyPI-only packages (niet in nixpkgs) installeren in user site
|
||||
if ! python -c "import agentmail" 2>/dev/null; then
|
||||
pip install --user --quiet agentmail
|
||||
echo "Installed agentmail via pip --user"
|
||||
fi
|
||||
source .venv/bin/activate
|
||||
'';
|
||||
};
|
||||
});
|
||||
|
|
|
|||
220
src/wetgit/pipeline/history.py
Normal file
220
src/wetgit/pipeline/history.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
"""Historie-reconstructie — alle toestanden als git commits met correcte datums.
|
||||
|
||||
Neemt een BWB-ID, haalt alle historische toestanden op via SRU,
|
||||
downloadt de XML, parseert naar Markdown, en maakt per toestand
|
||||
een git commit met de inwerkingtredings-datum als author date.
|
||||
|
||||
Usage:
|
||||
python -m wetgit.pipeline.history --bwb-id BWBR0001840 --repo /path/to/rijk --xml-cache /path/to/cache
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
||||
from wetgit.pipeline.sru_client import SRURecord, fetch_all_toestanden
|
||||
|
||||
logger = logging.getLogger(__name__)

# Maps a regulation type (the parser's ``soort`` value) to the top-level
# directory used for it inside the repository. Every directory name is simply
# the lowercased type name; types missing from this table are looked up with
# a fallback by the caller.
TYPE_TO_DIR = {
    soort: soort.lower()
    for soort in (
        "wet",
        "AMvB",
        "ministeriele-regeling",
        "KB",
        "rijkswet",
        "verdrag",
        "beleidsregel",
        "circulaire",
        "zbo",
        "pbo",
    )
}
|
||||
|
||||
|
||||
def reconstruct_history(
    bwb_id: str,
    repo_path: Path,
    xml_cache: Path,
    delay: float = 0.3,
    dry_run: bool = False,
) -> dict[str, int]:
    """Reconstruct the full version history of a single regulation.

    Fetches every historical state ("toestand") of ``bwb_id``, downloads the
    XML of each state into ``xml_cache`` (files already present are reused),
    parses it to Markdown and creates one git commit per changed state in
    ``repo_path``. The state's ``datum_geldig_van`` is used as both the
    author date and the committer date of the commit.

    Args:
        bwb_id: BWB identifier (e.g. BWBR0001840).
        repo_path: Path to the local git clone that receives the commits.
        xml_cache: Directory for downloaded XML files; created if missing.
        delay: Seconds to sleep after each successful download.
        dry_run: If True, log the commits that would be made but do not run
            git. Note that the Markdown file is still written to
            ``repo_path`` in a dry run.

    Returns:
        Dict with the counters ``toestanden``, ``commits``, ``skipped`` and
        ``failed``.
    """
    # Local import: ``os`` is not part of this module's import block.
    import os

    xml_cache.mkdir(parents=True, exist_ok=True)
    stats = {"toestanden": 0, "commits": 0, "skipped": 0, "failed": 0}

    # Step 1: fetch all states.
    logger.info("Ophalen toestanden voor %s...", bwb_id)
    toestanden = fetch_all_toestanden(bwb_id)
    stats["toestanden"] = len(toestanden)
    logger.info("Gevonden: %d toestanden", len(toestanden))

    if not toestanden:
        logger.warning("Geen toestanden gevonden voor %s", bwb_id)
        return stats

    total = len(toestanden)
    # Cache paths whose download failed; they are already counted as
    # "failed" and must not be counted a second time as "skipped" below.
    failed_downloads: set[Path] = set()

    # Step 2: download every XML that is not in the cache yet.
    for i, toestand in enumerate(toestanden, start=1):
        datum = toestand.datum_geldig_van or "onbekend"
        xml_path = xml_cache / f"{bwb_id}_{datum}.xml"

        if xml_path.exists():
            continue

        logger.info(" [%d/%d] Downloaden %s...", i, total, datum)
        try:
            resp = httpx.get(toestand.xml_url, timeout=60, follow_redirects=True)
            resp.raise_for_status()
            xml_path.write_bytes(resp.content)
        except httpx.HTTPError as e:
            logger.warning(" Download mislukt voor %s: %s", datum, e)
            stats["failed"] += 1
            failed_downloads.add(xml_path)
            continue
        if delay > 0:
            time.sleep(delay)

    # Step 3: parse and commit, in the order returned by fetch_all_toestanden.
    for i, toestand in enumerate(toestanden, start=1):
        datum = toestand.datum_geldig_van or "onbekend"
        xml_path = xml_cache / f"{bwb_id}_{datum}.xml"

        if not xml_path.exists():
            if xml_path not in failed_downloads:
                stats["skipped"] += 1
            continue

        logger.info(" [%d/%d] Parsing %s...", i, total, datum)

        try:
            result = parse_bwb_xml(str(xml_path))
        except Exception as e:
            # Best effort: one unparsable state must not abort the whole run.
            logger.warning(" Parse mislukt voor %s: %s", datum, e)
            stats["failed"] += 1
            continue

        # Determine the output path: <type>/<slug>/<bwb_id>/README.md
        type_dir = TYPE_TO_DIR.get(result.soort, "overig")
        slug = _slugify(result.titel) if result.titel else bwb_id.lower()
        md_path = repo_path / type_dir / slug / bwb_id / "README.md"
        md_path.parent.mkdir(parents=True, exist_ok=True)

        # Skip states whose Markdown is identical to what is already on disk.
        if md_path.exists() and md_path.read_text(encoding="utf-8") == result.markdown:
            logger.debug(" Geen wijziging voor %s, overslaan", datum)
            stats["skipped"] += 1
            continue

        md_path.write_text(result.markdown, encoding="utf-8")

        if dry_run:
            logger.info(" [DRY-RUN] Zou committen: %s | %s", datum, bwb_id)
            stats["commits"] += 1
            continue

        # Git commit with the state's date as author and committer date.
        commit_msg = f"{datum} | {bwb_id} | {result.titel}"
        author_date = _to_git_date(datum)
        rel_path = md_path.relative_to(repo_path)
        git_env = {
            **os.environ,
            "GIT_AUTHOR_DATE": author_date,
            "GIT_COMMITTER_DATE": author_date,
        }

        try:
            subprocess.run(
                ["git", "add", "--", str(rel_path)],
                cwd=repo_path, check=True, capture_output=True,
            )
            subprocess.run(
                ["git", "commit", "-m", commit_msg,
                 "--author", "Coornhert <coornhert@wetgit.nl>"],
                cwd=repo_path, check=True, capture_output=True,
                env=git_env,
            )
        except subprocess.CalledProcessError as e:
            stdout = e.stdout.decode(errors="replace") if e.stdout else ""
            stderr = e.stderr.decode(errors="replace") if e.stderr else ""
            # git reports "nothing to commit" on stdout, so inspect both
            # streams instead of stderr only.
            if "nothing to commit" in stdout or "nothing to commit" in stderr:
                stats["skipped"] += 1
            else:
                logger.warning(" Git error: %s", (stderr or stdout)[:200])
                stats["failed"] += 1
        else:
            stats["commits"] += 1
            logger.info(" Commit: %s", commit_msg)

    logger.info(
        "Historie %s: %d toestanden, %d commits, %d skipped, %d failed",
        bwb_id, stats["toestanden"], stats["commits"],
        stats["skipped"], stats["failed"],
    )
    return stats
|
||||
|
||||
|
||||
def _to_git_date(date_str: str) -> str:
|
||||
"""Converteer YYYY-MM-DD naar git-compatible datum string."""
|
||||
try:
|
||||
dt = datetime.strptime(date_str, "%Y-%m-%d").replace(
|
||||
hour=12, tzinfo=timezone.utc,
|
||||
)
|
||||
return dt.strftime("%Y-%m-%dT%H:%M:%S%z")
|
||||
except ValueError:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S%z")
|
||||
|
||||
|
||||
def _slugify(text: str) -> str:
|
||||
"""Maak een URL-veilige slug van een titel."""
|
||||
import re
|
||||
slug = text.lower().strip()
|
||||
slug = re.sub(r"[^\w\s-]", "", slug)
|
||||
slug = re.sub(r"[\s_]+", "-", slug)
|
||||
slug = re.sub(r"-+", "-", slug)
|
||||
return slug[:80].strip("-")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the reconstruction
    # and print the resulting counters as JSON.
    import argparse
    import json

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="WetGit historie-reconstructie")
    cli.add_argument("--bwb-id", required=True, help="BWB identificatienummer")
    # Both required path options share the same shape.
    for flag, help_text in (
        ("--repo", "Pad naar wetgit/rijk clone"),
        ("--xml-cache", "XML cache directory"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=help_text)
    cli.add_argument("--delay", type=float, default=0.3, help="Delay tussen downloads")
    cli.add_argument("--dry-run", action="store_true", help="Geen git commits")
    opts = cli.parse_args()

    summary = reconstruct_history(
        opts.bwb_id,
        opts.repo,
        opts.xml_cache,
        delay=opts.delay,
        dry_run=opts.dry_run,
    )
    print(json.dumps(summary, indent=2))
|
||||
|
|
@ -104,6 +104,33 @@ def fetch_latest_toestand(bwb_id: str) -> SRURecord | None:
|
|||
return records[-1]
|
||||
|
||||
|
||||
def fetch_all_toestanden(bwb_id: str) -> list[SRURecord]:
    """Fetch all states (versions) for a single BWB-ID.

    Pages through the results for ``dcterms.identifier=<bwb_id>`` 100 records
    at a time, pausing 0.3 seconds between pages, until the reported total
    has been reached or a page comes back empty.

    Args:
        bwb_id: The BWB identifier (e.g. BWBR0001840).

    Returns:
        List of SRURecords sorted by ``datum_geldig_van``, oldest first;
        records without that date sort before all others.
    """
    query = f"dcterms.identifier={bwb_id}"
    collected: list[SRURecord] = []
    next_start = 1

    finished = False
    while not finished:
        page, total = _fetch_page(query, next_start, 100)
        collected += page
        next_start += len(page)
        finished = next_start > total or not page
        if not finished:
            time.sleep(0.3)

    # Order by validity start date (oldest first).
    return sorted(collected, key=lambda rec: rec.datum_geldig_van or "0000-00-00")
|
||||
|
||||
|
||||
def _fetch_page(
|
||||
query: str, start_record: int, maximum_records: int,
|
||||
) -> tuple[list[SRURecord], int]:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue