From 1dc93b0f8958d0814bec63f89eb2c7ca534c634f Mon Sep 17 00:00:00 2001 From: Coornhert Date: Sun, 29 Mar 2026 21:24:32 +0200 Subject: [PATCH] feat: project scaffold + BWB XML parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - pyproject.toml met wetgit package, pytest/ruff/black/mypy config - BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py) - Getest op ~400 regelingen over alle BWB-types - 20 edge cases gevonden en opgelost: - , , structuren - , containers - , , , - CALS → Markdown tabellen - , , inline formatting - , , , - Nix flake devshell met alle dependencies - CLI entrypoint (wetgit) - Domain models (Regeling, Artikel) Sluit #4, sluit #5 --- .env.example | 10 + .gitignore | 33 ++ flake.lock | 61 ++++ flake.nix | 116 +++++++ pyproject.toml | 92 ++++++ src/wetgit/__init__.py | 3 + src/wetgit/api/__init__.py | 1 + src/wetgit/cli/__init__.py | 1 + src/wetgit/cli/main.py | 17 ++ src/wetgit/models.py | 55 ++++ src/wetgit/pipeline/__init__.py | 1 + src/wetgit/pipeline/bwb_parser.py | 487 ++++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/pipeline/__init__.py | 0 tests/pipeline/test_bwb_parser.py | 59 ++++ 15 files changed, 936 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 pyproject.toml create mode 100644 src/wetgit/__init__.py create mode 100644 src/wetgit/api/__init__.py create mode 100644 src/wetgit/cli/__init__.py create mode 100644 src/wetgit/cli/main.py create mode 100644 src/wetgit/models.py create mode 100644 src/wetgit/pipeline/__init__.py create mode 100644 src/wetgit/pipeline/bwb_parser.py create mode 100644 tests/__init__.py create mode 100644 tests/pipeline/__init__.py create mode 100644 tests/pipeline/test_bwb_parser.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..38159cb --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +# WetGIT Environment 
Variables +# Copy to .env and fill in real values: +# cp .env.example .env + +# AgentMail API (coornhert@wetgit.nl) +# Get your key from https://console.agentmail.to +AGENTMAIL_API_KEY= + +# Hetzner Cloud +HCLOUD_TOKEN= diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3401136 --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +# Secrets +.env +.env.* +!.env.example +ansible/.vault_pass + +# Nix / direnv +.direnv/ +result + +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +.venv/ +*.egg + +# Testing +.coverage +htmlcov/ +.pytest_cache/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..8b44baf --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1774610258, + "narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + 
"version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..d2c683e --- /dev/null +++ b/flake.nix @@ -0,0 +1,116 @@ +{ + description = "WetGit - Nederlandse wetgeving als code"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Python 3.13 (zelfde versie als ansible gebruikt, voorkomt PATH-conflicten) + pythonEnv = pkgs.python313.withPackages (ps: with ps; [ + # Conversie-pipeline (PRD: Technische Stack) + lxml # BWB XML-parsing met XPath/XSLT + pygit2 # Git-operaties via libgit2 (performanter dan GitPython) + pyyaml # YAML frontmatter generatie + python-frontmatter # Markdown + YAML frontmatter parsing + + # API-laag (PRD: FastAPI) + fastapi + uvicorn # ASGI server + httpx # Async HTTP client (SRU-API, EUR-Lex) + pydantic # Data validatie + + # Achtergrondtaken (PRD: Celery + Redis) + celery + redis # Python Redis client + + # CLI-tool (PRD: wetgit CLI) + click + rich # Terminal formatting + + # Testing + pytest + pytest-cov + pytest-asyncio + + # Development tools + black + ruff + mypy + pip + setuptools + wheel + build + + # Typing stubs + types-requests + types-pyyaml + ]); + + in { + devShells.default = pkgs.mkShell { + name = "wetgit"; + + buildInputs = with pkgs; [ + # Python environment + pythonEnv + + # Dependency management + uv + + # Ansible (infrastructuur provisioning Hetzner) + ansible + ansible-lint + + # Hetzner Cloud CLI + hcloud + + # Redis server (lokale development) + redis + + # Git & tools + git + jq + yq-go + curl + + # Native dependencies voor pygit2 + libgit2 + ]; + + shellHook = '' + echo "WetGit - Nederlandse wetgeving als code" + echo "" + echo "Python: $(python --version)" + echo "Ansible: $(ansible --version 2>/dev/null | head -1)" + echo "hcloud: $(hcloud version 2>/dev/null)" + echo "" + 
echo "Pipeline tools: lxml, pygit2, fastapi" + echo "Infra tools: ansible, hcloud" + echo "" + + # Laad .env als die bestaat (API keys, Hetzner token) + if [ -f .env ]; then + set -a + source .env + set +a + echo "Loaded environment from .env" + echo "" + fi + + # Venv voor PyPI-only packages (agentmail etc.) + if [ ! -d .venv ]; then + uv venv .venv --python python3.13 --seed + uv pip install --python .venv/bin/python agentmail + echo "Created .venv and installed PyPI dependencies" + fi + source .venv/bin/activate + ''; + }; + }); +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ca1472a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,92 @@ +[project] +name = "wetgit" +version = "0.1.0" +description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit" +readme = "README.md" +license = "MIT" +requires-python = ">=3.12" +authors = [ + { name = "Coornhert", email = "coornhert@wetgit.nl" }, +] +keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"] +classifiers = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Legal Industry", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.13", + "Topic :: Text Processing :: Markup", +] +dependencies = [ + "lxml>=5.0", + "pygit2>=1.13", + "pyyaml>=6.0", + "python-frontmatter>=1.1", + "httpx>=0.27", + "click>=8.1", + "rich>=13.0", + "pydantic>=2.0", +] + +[project.optional-dependencies] +api = [ + "fastapi>=0.115", + "uvicorn>=0.30", + "celery>=5.4", + "redis>=5.0", +] +dev = [ + "pytest>=8.0", + "pytest-cov>=5.0", + "pytest-asyncio>=0.24", + "black>=24.0", + "ruff>=0.6", + "mypy>=1.11", + "types-pyyaml", + "types-requests", +] + +[project.scripts] +wetgit = "wetgit.cli.main:cli" + +[project.urls] +Homepage = "https://wetgit.nl" +Repository = "https://git.wetgit.nl/wetgit/meta" +Issues = "https://git.wetgit.nl/wetgit/meta/issues" + +[build-system] +requires = 
["setuptools>=75.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +markers = [ + "unit: Unit tests (fast, no I/O)", + "integration: Integration tests (may need network/disk)", + "slow: Slow tests (large XML parsing, bulk operations)", +] +asyncio_mode = "auto" + +[tool.black] +line-length = 99 +target-version = ["py313"] + +[tool.ruff] +line-length = 99 +target-version = "py313" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"] +ignore = ["E501"] + +[tool.ruff.lint.isort] +known-first-party = ["wetgit"] + +[tool.mypy] +python_version = "3.13" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true diff --git a/src/wetgit/__init__.py b/src/wetgit/__init__.py new file mode 100644 index 0000000..e76c639 --- /dev/null +++ b/src/wetgit/__init__.py @@ -0,0 +1,3 @@ +"""WetGit — Nederlandse wetgeving als code.""" + +__version__ = "0.1.0" diff --git a/src/wetgit/api/__init__.py b/src/wetgit/api/__init__.py new file mode 100644 index 0000000..b1cf546 --- /dev/null +++ b/src/wetgit/api/__init__.py @@ -0,0 +1 @@ +"""FastAPI REST API.""" diff --git a/src/wetgit/cli/__init__.py b/src/wetgit/cli/__init__.py new file mode 100644 index 0000000..8d80cc3 --- /dev/null +++ b/src/wetgit/cli/__init__.py @@ -0,0 +1 @@ +"""WetGit CLI tool.""" diff --git a/src/wetgit/cli/main.py b/src/wetgit/cli/main.py new file mode 100644 index 0000000..cb29973 --- /dev/null +++ b/src/wetgit/cli/main.py @@ -0,0 +1,17 @@ +"""WetGit CLI — command-line interface.""" + +import click + +from wetgit import __version__ + + +@click.group() +@click.version_option(version=__version__, prog_name="wetgit") +def cli() -> None: + """WetGit — Nederlandse wetgeving als code.""" + + +@cli.command() +def version() -> None: + """Toon de WetGit versie.""" + click.echo(f"wetgit {__version__}") diff --git a/src/wetgit/models.py b/src/wetgit/models.py new file mode 
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so a member can be used wherever that string is expected.
    """

    WET = "wet"
    AMVB = "amvb"
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"


class RegelingStatus(str, Enum):
    """Status of a regulation."""

    GELDEND = "geldend"
    VERVALLEN = "vervallen"


@dataclass(frozen=True)
class Regeling:
    """Immutable metadata record for a single regulation."""

    # Mandatory fields: identification, classification and start date.
    bwb_id: str
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date

    # Optional fields; ``None`` means "not known / not applicable".
    datum_laatste_wijziging: date | None = None
    datum_verval: date | None = None
    citeertitel: str | None = None
    ministerie: str | None = None
    bron_url: str | None = None

    # A fresh list per instance (never a shared default).
    eu_implementatie: list[dict[str, str]] = field(default_factory=list)


@dataclass(frozen=True)
class Artikel:
    """Immutable record for one article inside a regulation."""

    nummer: str
    titel: str | None
    inhoud: str
    # A fresh list per instance (never a shared default).
    leden: list[str] = field(default_factory=list)
@dataclass
class ParsedRegeling:
    """Result of parsing one BWB toestand XML document.

    Carries the extracted metadata, the complete Markdown document (YAML
    frontmatter included) and the frontmatter mapping that was serialised
    into it.
    """

    bwb_id: str
    titel: str
    citeertitel: str | None
    soort: str
    datum_inwerkingtreding: str | None
    markdown: str
    frontmatter: dict[str, str | list[str] | None]


def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Parse a BWB toestand XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        A ParsedRegeling holding the metadata and the Markdown text.

    Raises:
        ValueError: If the document contains no ``<wetgeving>`` element.
    """
    tree = etree.parse(xml_path)
    wetgeving = tree.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # Metadata.
    soort = wetgeving.get("soort", "onbekend")
    bwb_id = _extract_bwb_id(tree)
    datum = wetgeving.get("inwerkingtredingsdatum")

    intitule = wetgeving.findtext(".//intitule", default="").strip()
    citeertitel = wetgeving.findtext(".//citeertitel", default="").strip() or None
    # Prefer the citation title, then the official title, then the identifier.
    titel = citeertitel or intitule or bwb_id

    # The body text lives in a different container depending on the kind of
    # regulation; try the candidates from most to least specific.
    wettekst = None
    for path in (
        ".//wet-besluit//wettekst",
        ".//regeling-tekst",
        ".//circulaire-tekst",
        ".//wettekst",
    ):
        wettekst = wetgeving.find(path)
        if wettekst is not None:
            break

    md_parts: list[str] = _parse_wettekst(wettekst) if wettekst is not None else []

    # Annexes are looked up on <wetgeving> itself, independent of the body container.
    for bijlage in wetgeving.findall(".//bijlage"):
        bijlage_md = _parse_bijlage(bijlage)
        if bijlage_md:
            md_parts.append(bijlage_md)

    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
    }
    if citeertitel:
        frontmatter["citeertitel"] = citeertitel

    fm_yaml = yaml.dump(
        frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False
    )
    # Join title and body blocks in one go so that a document without any
    # body text ends in a single newline instead of three.
    body = "\n\n".join([f"# {titel}", *md_parts])
    markdown = f"---\n{fm_yaml.strip()}\n---\n\n{body}\n"

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
bwb_id, + "type": soort, + "status": "geldend", + "datum_inwerkingtreding": datum, + "bron": f"https://wetten.overheid.nl/{bwb_id}", + } + if citeertitel: + frontmatter["citeertitel"] = citeertitel + + # Markdown samenstellen + fm_yaml = yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False) + markdown = f"---\n{fm_yaml.strip()}\n---\n\n# {titel}\n\n" + markdown += "\n\n".join(md_parts) + markdown += "\n" + + return ParsedRegeling( + bwb_id=bwb_id, + titel=titel, + citeertitel=citeertitel, + soort=soort, + datum_inwerkingtreding=datum, + markdown=markdown, + frontmatter=frontmatter, + ) + + +def _extract_bwb_id(tree: etree._ElementTree) -> str: + """Haal BWB-ID op uit het XML-document.""" + root = tree.getroot() + # Probeer eerst via wetgeving stam-id + wetgeving = tree.find(".//wetgeving") + if wetgeving is not None: + stam_id = wetgeving.get("stam-id", "") + # stam-id is niet het BWB-ID, zoek in meta-data + # Zoek in meta-data + for elem in tree.iter(): + if elem.tag == "toestand": + bwb_id = elem.get("bwb-id") + if bwb_id: + return bwb_id + # Fallback: zoek in bwb-inputbestand + inp = tree.find(".//bwb-inputbestand") + if inp is not None: + for child in inp.iter(): + bwb_id = child.get("bwb-id") + if bwb_id: + return bwb_id + return "ONBEKEND" + + +def _parse_wettekst(wettekst: etree._Element) -> list[str]: + """Parse het element naar Markdown-blokken.""" + parts: list[str] = [] + for child in wettekst: + tag = child.tag + if tag == "deel": + parts.append(_parse_structuur(child, level=2, label="Deel")) + elif tag == "boek": + parts.append(_parse_structuur(child, level=2, label="Boek")) + elif tag == "hoofdstuk": + parts.append(_parse_structuur(child, level=2, label="Hoofdstuk")) + elif tag == "titeldeel": + parts.append(_parse_structuur(child, level=2, label="Titel")) + elif tag == "afdeling": + parts.append(_parse_structuur(child, level=3, label="Afdeling")) + elif tag == "paragraaf": + parts.append(_parse_structuur(child, 
level=3, label="Paragraaf")) + elif tag == "circulaire.divisie": + parts.append(_parse_structuur(child, level=2, label="")) + elif tag == "sub-paragraaf": + parts.append(_parse_structuur(child, level=3, label="Paragraaf")) + elif tag == "divisie": + parts.append(_parse_structuur(child, level=2, label="")) + elif tag in ("artikel", "enig-artikel"): + parts.append(_parse_artikel(child)) + elif tag == "tekst": + parts.append(_parse_tekst_content(child)) + elif tag == "al": + parts.append(_get_text(child)) + elif tag == "table": + parts.append(_parse_cals_table(child)) + elif tag == "tabel": + parts.append("*[tabel]*") + elif tag in ("plaatje", "illustratie"): + parts.append("*[afbeelding]*") + return [p for p in parts if p.strip()] + + +def _parse_structuur(elem: etree._Element, level: int, label: str) -> str: + """Parse een structuurelement (hoofdstuk, afdeling, paragraaf, boek).""" + nr = "" + titel = "" + + # element bevat nr en titel bij boeken en sommige andere structuren + kop = elem.find("./kop") + if kop is not None: + nr = kop.findtext("./nr", default="").strip() + titel = kop.findtext("./titel", default="").strip() + + # Fallback: directe child-elementen + if not nr: + nr = elem.findtext("./nr", default="").strip() + if not titel: + titel = ( + elem.findtext("./hoofdstuktitel", default="") + or elem.findtext("./titeldeel-titel", default="") + or elem.findtext("./afdelingtitel", default="") + or elem.findtext("./paragraaftitel", default="") + or elem.findtext("./boektitel", default="") + or elem.findtext("./titel", default="") + ).strip() + + heading = "#" * level + header = f"{heading} {label} {nr}" + if titel: + header += f". 
{titel}" + + skip_tags = { + "nr", "kop", "titel", + "hoofdstuktitel", "titeldeel-titel", "afdelingtitel", + "paragraaftitel", "boektitel", + } + + parts = [header] + + for child in elem: + tag = child.tag + if tag in skip_tags: + continue + elif tag == "deel": + parts.append(_parse_structuur(child, level=level + 1, label="Deel")) + elif tag == "boek": + parts.append(_parse_structuur(child, level=level + 1, label="Boek")) + elif tag == "hoofdstuk": + parts.append(_parse_structuur(child, level=level + 1, label="Hoofdstuk")) + elif tag == "titeldeel": + parts.append(_parse_structuur(child, level=level + 1, label="Titel")) + elif tag == "afdeling": + parts.append(_parse_structuur(child, level=level + 1, label="Afdeling")) + elif tag == "paragraaf": + parts.append(_parse_structuur(child, level=level + 1, label="Paragraaf")) + elif tag == "circulaire.divisie": + parts.append(_parse_structuur(child, level=level + 1, label="")) + elif tag == "sub-paragraaf": + parts.append(_parse_structuur(child, level=level + 1, label="Paragraaf")) + elif tag == "divisie": + parts.append(_parse_structuur(child, level=level + 1, label="")) + elif tag in ("artikel", "enig-artikel"): + parts.append(_parse_artikel(child)) + elif tag == "al": + parts.append(_get_text(child)) + elif tag == "tekst": + parts.append(_parse_tekst_content(child)) + elif tag == "tussenkop": + text = _get_text(child).strip() + if text: + sub_heading = "#" * min(level + 1, 6) + parts.append(f"{sub_heading} {text}") + elif tag == "redactie": + text = _get_text(child).strip() + if text: + parts.append(f"*[{text}]*") + elif tag == "table": + parts.append(_parse_cals_table(child)) + elif tag == "tabel": + parts.append("*[tabel]*") + elif tag in ("plaatje", "illustratie"): + parts.append("*[afbeelding]*") + + return "\n\n".join(parts) + + +def _parse_artikel(artikel: etree._Element) -> str: + """Parse een element naar Markdown.""" + nr = artikel.findtext(".//nr", default="").strip() + heading = f"### Artikel {nr}" if nr 
else "### Artikel" + + parts = [heading] + + for child in artikel: + tag = child.tag + if tag == "nr": + continue + elif tag == "titel": + titel_text = _get_text(child).strip() + if titel_text: + parts.append(f"*{titel_text}*") + elif tag == "lid": + parts.append(_parse_lid(child)) + elif tag == "al": + parts.append(_get_text(child)) + elif tag == "lijst": + parts.append(_parse_lijst(child)) + elif tag == "tekst": + parts.append(_parse_tekst_content(child)) + elif tag == "tabel": + parts.append("*[tabel]*") + elif tag == "gereserveerd": + parts.append("*[Dit artikel is gereserveerd.]*") + elif tag == "vervallen": + parts.append("*[Dit artikel is vervallen.]*") + elif tag == "lid-vervallen": + lidnr = child.findtext(".//lidnr", default="").strip() + parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*") + elif tag == "lidnr": + continue + + return "\n\n".join(parts) + + +def _parse_lid(lid: etree._Element) -> str: + """Parse een element.""" + lidnr = lid.findtext(".//lidnr", default="").strip() + parts: list[str] = [] + if lidnr: + parts.append(f"**{lidnr}.**") + + for child in lid: + tag = child.tag + if tag == "lidnr": + continue + elif tag == "al": + parts.append(_get_text(child)) + elif tag == "lijst": + parts.append(_parse_lijst(child)) + elif tag == "tekst": + parts.append(_parse_tekst_content(child)) + elif tag == "table": + parts.append(_parse_cals_table(child)) + elif tag == "tabel": + parts.append("*[tabel]*") + elif tag == "formule": + parts.append(f"*[formule: {_get_text(child)}]*") + elif tag == "redactie": + text = _get_text(child).strip() + if text: + parts.append(f"*[{text}]*") + + return " ".join(parts) if lidnr and len(parts) <= 2 else "\n\n".join(parts) + + +def _parse_lijst(lijst: etree._Element) -> str: + """Parse een element naar Markdown-lijst.""" + items: list[str] = [] + for li in lijst.findall(".//li"): + nr = li.findtext(".//li.nr", default="").strip() + body = li.find(".//li.body") + if body is not None: + text = 
_parse_tekst_content(body) + else: + text = _get_text(li) + prefix = f"{nr} " if nr else "- " + items.append(f"{prefix}{text}") + return "\n".join(items) + + +def _parse_tekst_content(elem: etree._Element) -> str: + """Parse gemengde content (al, lijst, etc.) binnen een element.""" + parts: list[str] = [] + for child in elem: + if child.tag == "al": + parts.append(_get_text(child)) + elif child.tag == "lijst": + parts.append(_parse_lijst(child)) + elif child.tag == "table": + parts.append(_parse_cals_table(child)) + elif child.tag == "tabel": + parts.append("*[tabel]*") + elif child.tag in ("plaatje", "illustratie"): + parts.append("*[afbeelding]*") + elif child.tag == "redactie": + text = _get_text(child).strip() + if text: + parts.append(f"*[{text}]*") + if not parts: + text = _get_text(elem) + if text: + parts.append(text) + return "\n\n".join(parts) + + +def _parse_cals_table(table: etree._Element) -> str: + """Parse een CALS
element naar Markdown tabel.""" + rows: list[list[str]] = [] + has_header = False + + # Thead + thead = table.find(".//thead") + if thead is not None: + has_header = True + for row in thead.findall(".//row"): + cells = [_get_text(e) for e in row.findall(".//entry")] + rows.append(cells) + + # Tbody + tbody = table.find(".//tbody") + if tbody is not None: + for row in tbody.findall(".//row"): + cells = [_get_text(e) for e in row.findall(".//entry")] + rows.append(cells) + else: + # Geen tbody — rows direct onder tgroup + for row in table.findall(".//row"): + cells = [_get_text(e) for e in row.findall(".//entry")] + rows.append(cells) + + if not rows: + return "*[tabel]*" + + # Normaliseer kolom-aantallen + max_cols = max(len(r) for r in rows) + for r in rows: + while len(r) < max_cols: + r.append("") + + # Markdown tabel genereren + lines: list[str] = [] + for i, row in enumerate(rows): + escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in row] + lines.append("| " + " | ".join(escaped) + " |") + if i == 0: + lines.append("| " + " | ".join("---" for _ in escaped) + " |") + + return "\n".join(lines) + + +def _parse_bijlage(bijlage: etree._Element) -> str: + """Parse een element naar Markdown.""" + # Titel uit + kop = bijlage.find("./kop") + nr = "" + titel = "" + if kop is not None: + nr = kop.findtext("./nr", default="").strip() + titel = kop.findtext("./titel", default="").strip() + + header = f"## Bijlage {nr}" + if titel: + header += f". 
{titel}" + + parts = [header] + + for child in bijlage: + tag = child.tag + if tag == "kop": + continue + elif tag == "artikel": + parts.append(_parse_artikel(child)) + elif tag == "tekst": + parts.append(_parse_tekst_content(child)) + elif tag == "tabel": + parts.append("*[tabel]*") + elif tag == "plaatje": + parts.append("*[afbeelding]*") + elif tag == "bijlage-tekst": + parts.append(_parse_tekst_content(child)) + elif tag in ("hoofdstuk", "titeldeel", "afdeling", "paragraaf"): + parts.append(_parse_structuur(child, level=3, label=tag.capitalize())) + elif tag == "al": + parts.append(_get_text(child)) + + return "\n\n".join(parts) + + +def _get_text(elem: etree._Element) -> str: + """Haal alle tekst op uit een element, met inline formatting. + + Verwerkt , , , en inline. + """ + parts: list[str] = [] + _collect_text(elem, parts) + return "".join(parts).strip() + + +def _collect_text(elem: etree._Element, parts: list[str]) -> None: + """Recursief tekst verzamelen met inline Markdown formatting.""" + if elem.text: + parts.append(elem.text) + + for child in elem: + tag = child.tag + if tag == "nadruk": + nadruk_type = child.get("type", "") + inner = "".join(child.itertext()) + if nadruk_type == "vet": + parts.append(f"**{inner}**") + elif nadruk_type == "cur": + parts.append(f"*{inner}*") + else: + parts.append(inner) + elif tag == "sup": + inner = "".join(child.itertext()) + parts.append(f"^{inner}") + elif tag in ("sub", "inf"): + inner = "".join(child.itertext()) + parts.append(f"_{inner}") + elif tag in ("extref", "intref"): + # Tekst behouden, link-info gaat verloren in Markdown v0.1 + _collect_text(child, parts) + elif tag == "nootref": + # Voetnootverwijzing — neem tekst mee + inner = "".join(child.itertext()) + parts.append(inner) + else: + # Onbekend inline element — neem tekst gewoon mee + _collect_text(child, parts) + + if child.tail: + parts.append(child.tail) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory: pytest.TempPathFactory) -> str:
    """Download the Grondwet XML once per test module and return its path.

    The fixture is module-scoped so the tests below share one download
    instead of fetching the same file for every test. A module-scoped
    fixture cannot use the function-scoped ``tmp_path``, hence
    ``tmp_path_factory``.
    """
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)


@pytest.mark.integration
class TestBWBParser:
    """Integration tests that parse the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.bwb_id == "BWBR0001840"
        assert result.titel == "Grondwet"
        assert result.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.frontmatter["bwb_id"] == "BWBR0001840"
        assert result.frontmatter["type"] == "wet"
        assert result.frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(result.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "### Artikel 1" in result.markdown
        assert "gelijke gevallen gelijk behandeld" in result.markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "## Hoofdstuk 1" in result.markdown
        assert "## Hoofdstuk 2" in result.markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.markdown.startswith("---\n")
        assert "\n---\n" in result.markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        # The document starts with the frontmatter fence ...
        lines = result.markdown.split("\n")
        assert lines[0] == "---"
        # ... and contains exactly one level-1 heading: the title.
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"