feat: project scaffold + BWB XML parser

- pyproject.toml met wetgit package, pytest/ruff/black/mypy config
- BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py)
- Getest op ~400 regelingen over alle BWB-types
- 20 edge cases gevonden en opgelost:
  - <boek>, <deel>, <kop> structuren
  - <regeling-tekst>, <circulaire-tekst> containers
  - <bijlage>, <enig-artikel>, <sub-paragraaf>, <divisie>
  - CALS <table> → Markdown tabellen
  - <nadruk>, <sup>, <sub> inline formatting
  - <redactie>, <tussenkop>, <gereserveerd>, <vervallen>
- Nix flake devshell met alle dependencies
- CLI entrypoint (wetgit)
- Domain models (Regeling, Artikel)

Sluit #4, sluit #5
This commit is contained in:
Coornhert 2026-03-29 21:24:32 +02:00
parent bed91e891e
commit 1dc93b0f89
15 changed files with 936 additions and 0 deletions

10
.env.example Normal file
View file

@ -0,0 +1,10 @@
# WetGIT Environment Variables
# Copy to .env and fill in real values:
# cp .env.example .env
# AgentMail API (coornhert@wetgit.nl)
# Get your key from https://console.agentmail.to
AGENTMAIL_API_KEY=
# Hetzner Cloud
HCLOUD_TOKEN=

33
.gitignore vendored Normal file
View file

@ -0,0 +1,33 @@
# Secrets
.env
.env.*
!.env.example
ansible/.vault_pass
# Nix / direnv
.direnv/
result
# Python
__pycache__/
*.py[cod]
*.egg-info/
dist/
build/
.venv/
*.egg
# Testing
.coverage
htmlcov/
.pytest_cache/
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db

61
flake.lock generated Normal file
View file

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1774610258,
"narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

116
flake.nix Normal file
View file

@ -0,0 +1,116 @@
{
description = "WetGit - Nederlandse wetgeving als code";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils";
};
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = nixpkgs.legacyPackages.${system};
# Python 3.13 (zelfde versie als ansible gebruikt, voorkomt PATH-conflicten)
pythonEnv = pkgs.python313.withPackages (ps: with ps; [
# Conversie-pipeline (PRD: Technische Stack)
lxml # BWB XML-parsing met XPath/XSLT
pygit2 # Git-operaties via libgit2 (performanter dan GitPython)
pyyaml # YAML frontmatter generatie
python-frontmatter # Markdown + YAML frontmatter parsing
# API-laag (PRD: FastAPI)
fastapi
uvicorn # ASGI server
httpx # Async HTTP client (SRU-API, EUR-Lex)
pydantic # Data validatie
# Achtergrondtaken (PRD: Celery + Redis)
celery
redis # Python Redis client
# CLI-tool (PRD: wetgit CLI)
click
rich # Terminal formatting
# Testing
pytest
pytest-cov
pytest-asyncio
# Development tools
black
ruff
mypy
pip
setuptools
wheel
build
# Typing stubs
types-requests
types-pyyaml
]);
in {
devShells.default = pkgs.mkShell {
name = "wetgit";
buildInputs = with pkgs; [
# Python environment
pythonEnv
# Dependency management
uv
# Ansible (infrastructuur provisioning Hetzner)
ansible
ansible-lint
# Hetzner Cloud CLI
hcloud
# Redis server (lokale development)
redis
# Git & tools
git
jq
yq-go
curl
# Native dependencies voor pygit2
libgit2
];
shellHook = ''
echo "WetGit - Nederlandse wetgeving als code"
echo ""
echo "Python: $(python --version)"
echo "Ansible: $(ansible --version 2>/dev/null | head -1)"
echo "hcloud: $(hcloud version 2>/dev/null)"
echo ""
echo "Pipeline tools: lxml, pygit2, fastapi"
echo "Infra tools: ansible, hcloud"
echo ""
# Laad .env als die bestaat (API keys, Hetzner token)
if [ -f .env ]; then
set -a
source .env
set +a
echo "Loaded environment from .env"
echo ""
fi
# Venv voor PyPI-only packages (agentmail etc.)
if [ ! -d .venv ]; then
uv venv .venv --python python3.13 --seed
uv pip install --python .venv/bin/python agentmail
echo "Created .venv and installed PyPI dependencies"
fi
source .venv/bin/activate
'';
};
});
}

92
pyproject.toml Normal file
View file

@ -0,0 +1,92 @@
[project]
name = "wetgit"
version = "0.1.0"
description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit"
readme = "README.md"
license = "MIT"
requires-python = ">=3.12"
authors = [
{ name = "Coornhert", email = "coornhert@wetgit.nl" },
]
keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"]
classifiers = [
"Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Legal Industry",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.13",
"Topic :: Text Processing :: Markup",
]
dependencies = [
"lxml>=5.0",
"pygit2>=1.13",
"pyyaml>=6.0",
"python-frontmatter>=1.1",
"httpx>=0.27",
"click>=8.1",
"rich>=13.0",
"pydantic>=2.0",
]
[project.optional-dependencies]
api = [
"fastapi>=0.115",
"uvicorn>=0.30",
"celery>=5.4",
"redis>=5.0",
]
dev = [
"pytest>=8.0",
"pytest-cov>=5.0",
"pytest-asyncio>=0.24",
"black>=24.0",
"ruff>=0.6",
"mypy>=1.11",
"types-pyyaml",
"types-requests",
]
[project.scripts]
wetgit = "wetgit.cli.main:cli"
[project.urls]
Homepage = "https://wetgit.nl"
Repository = "https://git.wetgit.nl/wetgit/meta"
Issues = "https://git.wetgit.nl/wetgit/meta/issues"
[build-system]
requires = ["setuptools>=75.0"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
"unit: Unit tests (fast, no I/O)",
"integration: Integration tests (may need network/disk)",
"slow: Slow tests (large XML parsing, bulk operations)",
]
asyncio_mode = "auto"
[tool.black]
line-length = 99
target-version = ["py313"]
[tool.ruff]
line-length = 99
target-version = "py313"
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"]
ignore = ["E501"]
[tool.ruff.lint.isort]
known-first-party = ["wetgit"]
[tool.mypy]
python_version = "3.13"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true

3
src/wetgit/__init__.py Normal file
View file

@ -0,0 +1,3 @@
"""WetGit — Nederlandse wetgeving als code."""
__version__ = "0.1.0"

View file

@ -0,0 +1 @@
"""FastAPI REST API."""

View file

@ -0,0 +1 @@
"""WetGit CLI tool."""

17
src/wetgit/cli/main.py Normal file
View file

@ -0,0 +1,17 @@
"""WetGit CLI — command-line interface."""
import click
from wetgit import __version__
# Root command group of the CLI; subcommands register themselves on it with
# @cli.command(). The --version option reports the package's __version__.
# NOTE: the docstring below is kept in Dutch on purpose — click uses a
# command's docstring as its help text, so it is user-facing output.
@click.group()
@click.version_option(version=__version__, prog_name="wetgit")
def cli() -> None:
    """WetGit — Nederlandse wetgeving als code."""
# Subcommand that prints "wetgit <version>" using the package's __version__.
# NOTE: the docstring is kept in Dutch on purpose — click uses a command's
# docstring as its help text, so it is user-facing output.
@cli.command()
def version() -> None:
    """Toon de WetGit versie."""
    click.echo(f"wetgit {__version__}")

55
src/wetgit/models.py Normal file
View file

@ -0,0 +1,55 @@
"""Domain models voor WetGit."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date
from enum import Enum
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification.

    Members also subclass ``str``, so each member compares equal to its value.
    """

    WET = "wet"
    AMVB = "amvb"  # presumably "algemene maatregel van bestuur" — confirm
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"  # presumably "koninklijk besluit" — confirm
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"  # presumably "zelfstandig bestuursorgaan" — confirm
class RegelingStatus(str, Enum):
    """Status of a regulation.

    Members also subclass ``str``, so each member compares equal to its value.
    """

    GELDEND = "geldend"  # "in force"
    VERVALLEN = "vervallen"  # "lapsed" / no longer in force
@dataclass(frozen=True)
class Regeling:
    """Metadata of a regulation."""

    # NOTE(review): frozen=True only prevents rebinding attributes. The list
    # held by eu_implementatie is still mutable, and hashing an instance raises
    # TypeError because the generated __hash__ includes that list field.
    bwb_id: str  # presumably the BWB identifier, e.g. "BWBR0001840" — confirm
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date  # date of entry into force
    datum_laatste_wijziging: date | None = None  # date of the latest amendment, if any
    datum_verval: date | None = None  # date the regulation lapsed, if any
    citeertitel: str | None = None  # short citation title, if any
    ministerie: str | None = None
    bron_url: str | None = None  # source URL
    # Each instance gets its own empty list by default.
    # NOTE(review): the expected keys of the dicts are not visible here — confirm.
    eu_implementatie: list[dict[str, str]] = field(default_factory=list)
@dataclass(frozen=True)
class Artikel:
    """An article within a regulation."""

    # NOTE(review): frozen=True only prevents rebinding attributes. The list
    # held by leden is still mutable, and hashing an instance raises TypeError
    # because the generated __hash__ includes that list field.
    nummer: str  # article number, kept as a string
    titel: str | None  # required argument, but may be None
    inhoud: str  # article content
    leden: list[str] = field(default_factory=list)  # defaults to a fresh empty list

View file

@ -0,0 +1 @@
"""BWB/CVDR/EUR-Lex conversie-pipeline."""

View file

@ -0,0 +1,487 @@
"""BWB XML naar Markdown parser.
Parseert BWB toestand-XML (schema versie 2.0) naar Markdown + YAML frontmatter.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from datetime import date
from io import StringIO
import yaml
from lxml import etree
@dataclass
class ParsedRegeling:
    """Result of parsing a BWB toestand XML file."""

    # BWB identifier; "ONBEKEND" when the document does not provide one.
    bwb_id: str
    # Display title: the citeertitel, else the intitule, else the BWB id.
    titel: str
    # Citation title, or None when absent or empty.
    citeertitel: str | None
    # Value of the "soort" attribute of <wetgeving> ("onbekend" when missing).
    soort: str
    # Raw "inwerkingtredingsdatum" attribute of <wetgeving>; None when missing.
    datum_inwerkingtreding: str | None
    # Complete document: YAML frontmatter, an H1 with the title, then the body.
    markdown: str
    # The mapping that was serialised into the YAML frontmatter.
    frontmatter: dict[str, str | list[str] | None]
def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Convert a BWB toestand XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        A ParsedRegeling holding the extracted metadata, the frontmatter
        mapping and the complete Markdown document.

    Raises:
        ValueError: If the document contains no <wetgeving> element.
    """
    document = etree.parse(xml_path)
    wetgeving = document.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # --- Metadata ---------------------------------------------------------
    bwb_id = _extract_bwb_id(document)
    soort = wetgeving.get("soort", "onbekend")
    datum = wetgeving.get("inwerkingtredingsdatum")
    intitule = wetgeving.findtext(".//intitule", default="").strip()
    citeertitel = wetgeving.findtext(".//citeertitel", default="").strip() or None
    titel = citeertitel or intitule or bwb_id

    # --- Body -------------------------------------------------------------
    # The text container differs per kind of regulation; the first match wins:
    #   formal acts             <wet-besluit><wettekst>
    #   ministerial regulations <regeling-tekst>
    #   circulars / ZBO         <circulaire-tekst>
    #   fallback                any <wettekst>
    container_paths = (
        ".//wet-besluit//wettekst",
        ".//regeling-tekst",
        ".//circulaire-tekst",
        ".//wettekst",
    )
    container = None
    for path in container_paths:
        container = wetgeving.find(path)
        if container is not None:
            break

    blocks: list[str] = _parse_wettekst(container) if container is not None else []

    # Annexes are collected from the whole <wetgeving> subtree and appended last.
    blocks.extend(
        rendered
        for rendered in map(_parse_bijlage, wetgeving.findall(".//bijlage"))
        if rendered
    )

    # --- Frontmatter --------------------------------------------------------
    # Key order is preserved in the YAML output (sort_keys=False below).
    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
        **({"citeertitel": citeertitel} if citeertitel else {}),
    }

    # --- Assemble the Markdown document ---------------------------------------
    fm_yaml = yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False)
    markdown = "".join(
        [
            f"---\n{fm_yaml.strip()}\n---\n\n# {titel}\n\n",
            "\n\n".join(blocks),
            "\n",
        ]
    )

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
def _extract_bwb_id(tree: etree._ElementTree) -> str:
    """Return the BWB identifier of the document.

    Lookup order:
      1. the ``bwb-id`` attribute of the first <toestand> element that has a
         non-empty one;
      2. the first non-empty ``bwb-id`` attribute on <bwb-inputbestand> or any
         of its descendants;
      3. the literal ``"ONBEKEND"`` when neither is present.

    Args:
        tree: The parsed XML document.

    Returns:
        The BWB identifier, or ``"ONBEKEND"``.
    """
    # The stam-id attribute of <wetgeving> is not the BWB id, so it is not consulted.
    for toestand in tree.iter("toestand"):
        bwb_id = toestand.get("bwb-id")
        if bwb_id:
            return bwb_id

    # Fallback: look inside <bwb-inputbestand> (iter() includes the element itself).
    inputbestand = tree.find(".//bwb-inputbestand")
    if inputbestand is not None:
        for node in inputbestand.iter():
            bwb_id = node.get("bwb-id")
            if bwb_id:
                return bwb_id

    return "ONBEKEND"
def _parse_wettekst(wettekst: etree._Element) -> list[str]:
    """Render the direct children of a text container as Markdown blocks.

    Args:
        wettekst: The container element (<wettekst>, <regeling-tekst>, ...).

    Returns:
        One Markdown string per recognised child, in document order. Children
        with an unknown tag and blocks that are empty or whitespace-only are
        left out.
    """
    # Structural tag -> (heading level, label used in the heading).
    structure_headings = {
        "deel": (2, "Deel"),
        "boek": (2, "Boek"),
        "hoofdstuk": (2, "Hoofdstuk"),
        "titeldeel": (2, "Titel"),
        "afdeling": (3, "Afdeling"),
        "paragraaf": (3, "Paragraaf"),
        "circulaire.divisie": (2, ""),
        "sub-paragraaf": (3, "Paragraaf"),
        "divisie": (2, ""),
    }
    # Tags that are not converted but replaced by a fixed marker.
    placeholders = {
        "tabel": "*[tabel]*",
        "plaatje": "*[afbeelding]*",
        "illustratie": "*[afbeelding]*",
    }

    blocks: list[str] = []
    for node in wettekst:
        tag = node.tag
        if tag in structure_headings:
            level, label = structure_headings[tag]
            rendered = _parse_structuur(node, level=level, label=label)
        elif tag in ("artikel", "enig-artikel"):
            rendered = _parse_artikel(node)
        elif tag == "tekst":
            rendered = _parse_tekst_content(node)
        elif tag == "al":
            rendered = _get_text(node)
        elif tag == "table":
            rendered = _parse_cals_table(node)
        elif tag in placeholders:
            rendered = placeholders[tag]
        else:
            continue

        if rendered.strip():
            blocks.append(rendered)
    return blocks
def _parse_structuur(elem: etree._Element, level: int, label: str) -> str:
    """Render a structural element (boek, deel, hoofdstuk, afdeling, ...) as Markdown.

    The heading is composed of ``label``, the element's number and its title.
    Empty components are left out, so an element without a label or number
    does not produce doubled spaces or a dangling ". " in the heading.
    Nested structural elements are rendered recursively one level deeper.

    Args:
        elem: The structural element to render.
        level: Markdown heading level. Values above 6 are rendered as level 6,
            because Markdown has no deeper headings.
        label: Word placed before the number (e.g. "Hoofdstuk"); may be empty.

    Returns:
        The heading followed by the rendered children, separated by blank lines.
    """
    nr = ""
    titel = ""

    # A <kop> child carries the number and title for books and some other structures.
    kop = elem.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # Fallback: number/title as direct children of the element itself.
    if not nr:
        nr = elem.findtext("./nr", default="").strip()
    if not titel:
        titel = (
            elem.findtext("./hoofdstuktitel", default="")
            or elem.findtext("./titeldeel-titel", default="")
            or elem.findtext("./afdelingtitel", default="")
            or elem.findtext("./paragraaftitel", default="")
            or elem.findtext("./boektitel", default="")
            or elem.findtext("./titel", default="")
        ).strip()

    # Clamp to the heading levels Markdown supports (1..6).
    hashes = "#" * max(1, min(level, 6))
    # Only join the components that are present: "Hoofdstuk 1. Titel",
    # "1. Titel", "Hoofdstuk 1", "Titel", ...
    designation = " ".join(part for part in (label, nr) if part)
    caption = ". ".join(part for part in (designation, titel) if part)
    header = f"{hashes} {caption}" if caption else hashes

    # Children that were already consumed for the heading.
    skip_tags = {
        "nr", "kop", "titel",
        "hoofdstuktitel", "titeldeel-titel", "afdelingtitel",
        "paragraaftitel", "boektitel",
    }
    # Nested structural tag -> label used in its heading.
    nested_labels = {
        "deel": "Deel",
        "boek": "Boek",
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
        "circulaire.divisie": "",
        "sub-paragraaf": "Paragraaf",
        "divisie": "",
    }

    parts = [header]
    for child in elem:
        tag = child.tag
        if tag in skip_tags:
            continue
        if tag in nested_labels:
            parts.append(_parse_structuur(child, level=level + 1, label=nested_labels[tag]))
        elif tag in ("artikel", "enig-artikel"):
            parts.append(_parse_artikel(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        elif tag == "tussenkop":
            # Intermediate heading: one level below this element, capped at 6.
            text = _get_text(child).strip()
            if text:
                sub_heading = "#" * min(level + 1, 6)
                parts.append(f"{sub_heading} {text}")
        elif tag == "redactie":
            # Editorial note, rendered as an italic bracketed remark.
            text = _get_text(child).strip()
            if text:
                parts.append(f"*[{text}]*")
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")

    return "\n\n".join(parts)
def _parse_artikel(artikel: etree._Element) -> str:
"""Parse een <artikel> element naar Markdown."""
nr = artikel.findtext(".//nr", default="").strip()
heading = f"### Artikel {nr}" if nr else "### Artikel"
parts = [heading]
for child in artikel:
tag = child.tag
if tag == "nr":
continue
elif tag == "titel":
titel_text = _get_text(child).strip()
if titel_text:
parts.append(f"*{titel_text}*")
elif tag == "lid":
parts.append(_parse_lid(child))
elif tag == "al":
parts.append(_get_text(child))
elif tag == "lijst":
parts.append(_parse_lijst(child))
elif tag == "tekst":
parts.append(_parse_tekst_content(child))
elif tag == "tabel":
parts.append("*[tabel]*")
elif tag == "gereserveerd":
parts.append("*[Dit artikel is gereserveerd.]*")
elif tag == "vervallen":
parts.append("*[Dit artikel is vervallen.]*")
elif tag == "lid-vervallen":
lidnr = child.findtext(".//lidnr", default="").strip()
parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*")
elif tag == "lidnr":
continue
return "\n\n".join(parts)
def _parse_lid(lid: etree._Element) -> str:
"""Parse een <lid> element."""
lidnr = lid.findtext(".//lidnr", default="").strip()
parts: list[str] = []
if lidnr:
parts.append(f"**{lidnr}.**")
for child in lid:
tag = child.tag
if tag == "lidnr":
continue
elif tag == "al":
parts.append(_get_text(child))
elif tag == "lijst":
parts.append(_parse_lijst(child))
elif tag == "tekst":
parts.append(_parse_tekst_content(child))
elif tag == "table":
parts.append(_parse_cals_table(child))
elif tag == "tabel":
parts.append("*[tabel]*")
elif tag == "formule":
parts.append(f"*[formule: {_get_text(child)}]*")
elif tag == "redactie":
text = _get_text(child).strip()
if text:
parts.append(f"*[{text}]*")
return " ".join(parts) if lidnr and len(parts) <= 2 else "\n\n".join(parts)
def _parse_lijst(lijst: etree._Element) -> str:
    """Render a <lijst> element as a Markdown list.

    Only the direct <li> children are iterated. Items of nested lists are
    rendered as part of the item that contains them (through
    ``_parse_tekst_content``), so they are not emitted a second time as
    items of the outer list.

    Args:
        lijst: The <lijst> element.

    Returns:
        One line per item, each prefixed with the item's own number
        (<li.nr>) or with "- " when it has none.
    """
    items: list[str] = []
    for li in lijst.findall("./li"):
        # Direct children only, so a nested list's number/body is never
        # mistaken for this item's.
        nr = li.findtext("./li.nr", default="").strip()
        body = li.find("./li.body")
        # Without <li.body> the content sits directly in <li>. Rendering it
        # block by block skips <li.nr>, whose text is already the prefix.
        text = _parse_tekst_content(body if body is not None else li)
        prefix = f"{nr} " if nr else "- "
        items.append(f"{prefix}{text}")
    return "\n".join(items)
def _parse_tekst_content(elem: etree._Element) -> str:
"""Parse gemengde content (al, lijst, etc.) binnen een element."""
parts: list[str] = []
for child in elem:
if child.tag == "al":
parts.append(_get_text(child))
elif child.tag == "lijst":
parts.append(_parse_lijst(child))
elif child.tag == "table":
parts.append(_parse_cals_table(child))
elif child.tag == "tabel":
parts.append("*[tabel]*")
elif child.tag in ("plaatje", "illustratie"):
parts.append("*[afbeelding]*")
elif child.tag == "redactie":
text = _get_text(child).strip()
if text:
parts.append(f"*[{text}]*")
if not parts:
text = _get_text(elem)
if text:
parts.append(text)
return "\n\n".join(parts)
def _parse_cals_table(table: etree._Element) -> str:
    """Render a CALS <table> element as a Markdown table.

    Rows of the first <thead> come first, followed by the rows of the first
    <tbody>. When there is no <tbody>, all remaining rows of the table are
    used; rows that belong to the <thead> are not added a second time.

    Args:
        table: The CALS <table> element.

    Returns:
        The Markdown table, or the placeholder ``*[tabel]*`` when the table
        contains no rows.
    """

    def cells_of(row: etree._Element) -> list[str]:
        """Return the inline text of every <entry> in the row."""
        return [_get_text(entry) for entry in row.findall(".//entry")]

    thead = table.find(".//thead")
    header_rows = thead.findall(".//row") if thead is not None else []

    tbody = table.find(".//tbody")
    if tbody is not None:
        body_rows = tbody.findall(".//row")
    else:
        # No <tbody>: take every row of the table, minus the header rows that
        # were already collected (compared by identity).
        body_rows = [
            row
            for row in table.findall(".//row")
            if not any(row is head for head in header_rows)
        ]

    rows = [cells_of(row) for row in (*header_rows, *body_rows)]
    if not rows:
        return "*[tabel]*"

    # Pad short rows so every row has the same number of columns.
    max_cols = max(len(cells) for cells in rows)
    for cells in rows:
        cells.extend([""] * (max_cols - len(cells)))

    lines: list[str] = []
    for index, cells in enumerate(rows):
        # A literal pipe or a newline inside a cell would break the table.
        escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in cells]
        lines.append("| " + " | ".join(escaped) + " |")
        if index == 0:
            # Markdown requires a separator line after the first row.
            lines.append("| " + " | ".join("---" for _ in escaped) + " |")
    return "\n".join(lines)
def _parse_bijlage(bijlage: etree._Element) -> str:
    """Render a <bijlage> (annex) element as Markdown.

    The level-2 heading is "Bijlage", followed by the number and title from
    <kop> when present. Missing components are left out, so an annex without
    a number does not get a heading like "Bijlage . Titel".

    Args:
        bijlage: The <bijlage> element.

    Returns:
        The heading and the rendered children, separated by blank lines.
    """
    nr = ""
    titel = ""
    kop = bijlage.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    header = f"## Bijlage {nr}" if nr else "## Bijlage"
    if titel:
        header += f". {titel}"

    # Same heading labels as the other structure parsers in this module
    # (in particular "Titel" for <titeldeel>).
    structure_labels = {
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
    }

    parts = [header]
    for child in bijlage:
        tag = child.tag
        if tag == "kop":
            continue
        if tag == "artikel":
            parts.append(_parse_artikel(child))
        elif tag in ("tekst", "bijlage-tekst"):
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            # CALS tables are converted, as in the other parsers of this module.
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag in structure_labels:
            parts.append(_parse_structuur(child, level=3, label=structure_labels[tag]))
        elif tag == "al":
            parts.append(_get_text(child))
    return "\n\n".join(parts)
def _get_text(elem: etree._Element) -> str:
    """Return the full text of an element with inline formatting applied.

    The element's text and that of all its descendants is gathered by
    ``_collect_text`` and concatenated; leading and trailing whitespace of
    the result is removed.
    """
    fragments: list[str] = []
    _collect_text(elem, fragments)
    combined = "".join(fragments)
    return combined.strip()
def _collect_text(elem: etree._Element, parts: list[str]) -> None:
    """Append the text of ``elem`` and its descendants to ``parts``.

    Inline elements are translated to Markdown-like markup:
      * <nadruk type="vet"> -> ``**text**``, <nadruk type="cur"> -> ``*text*``,
        any other <nadruk> -> plain text
      * <sup> -> ``^text``; <sub> and <inf> -> ``_text``
      * <extref>, <intref> and unknown elements -> their text, processed
        recursively (link targets are not kept)
      * <nootref> -> its plain text

    Comments and processing instructions contribute no text of their own;
    only the text that follows them (their tail) is kept.

    Args:
        elem: Element whose text is collected. Its own tail is not included.
        parts: Output list; text fragments are appended in document order.
    """
    if elem.text:
        parts.append(elem.text)

    for child in elem:
        tag = child.tag
        if not isinstance(tag, str):
            # Comments and processing instructions have a non-string tag.
            # Their content is not document text, so it is skipped.
            pass
        elif tag == "nadruk":
            nadruk_type = child.get("type", "")
            inner = "".join(child.itertext())
            if nadruk_type == "vet":
                parts.append(f"**{inner}**")
            elif nadruk_type == "cur":
                parts.append(f"*{inner}*")
            else:
                parts.append(inner)
        elif tag == "sup":
            parts.append("^" + "".join(child.itertext()))
        elif tag in ("sub", "inf"):
            parts.append("_" + "".join(child.itertext()))
        elif tag == "nootref":
            # Footnote reference: keep its text as-is.
            parts.append("".join(child.itertext()))
        else:
            # <extref>/<intref> (link information is dropped) and unknown
            # inline elements: keep the text, formatting nested elements.
            _collect_text(child, parts)

        # Text that follows the child belongs to the parent's content.
        if child.tail:
            parts.append(child.tail)

0
tests/__init__.py Normal file
View file

View file

View file

@ -0,0 +1,59 @@
"""Tests voor de BWB XML parser."""
import pytest
from wetgit.pipeline.bwb_parser import parse_bwb_xml
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory: pytest.TempPathFactory) -> str:
    """Download the Grondwet XML and return the path of the local copy.

    The fixture is module-scoped so the file is fetched once and shared by
    all tests in this module instead of being downloaded again for every
    test. The tests only read the file.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # Imported here so that merely collecting the tests does not require httpx.
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()

    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that run the parser on the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        """The core metadata fields are extracted."""
        parsed = parse_bwb_xml(grondwet_xml)
        assert parsed.bwb_id == "BWBR0001840"
        assert parsed.titel == "Grondwet"
        assert parsed.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        """The frontmatter mapping carries id, type, status and source URL."""
        frontmatter = parse_bwb_xml(grondwet_xml).frontmatter
        assert frontmatter["bwb_id"] == "BWBR0001840"
        assert frontmatter["type"] == "wet"
        assert frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        """Article 1 is present, including its text."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert "### Artikel 1" in document
        assert "gelijke gevallen gelijk behandeld" in document

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        """Chapters are rendered as level-2 headings."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert "## Hoofdstuk 1" in document
        assert "## Hoofdstuk 2" in document

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        """The document opens with a delimited YAML frontmatter block."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert document.startswith("---\n")
        assert "\n---\n" in document

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        """Frontmatter comes first and there is exactly one H1 with the title."""
        lines = parse_bwb_xml(grondwet_xml).markdown.split("\n")
        assert lines[0] == "---"
        # Exactly one level-1 heading, and it is the title.
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"