meta/tests/pipeline/test_bwb_parser.py
Coornhert 1dc93b0f89 feat: project scaffold + BWB XML parser
- pyproject.toml met wetgit package, pytest/ruff/black/mypy config
- BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py)
- Getest op ~400 regelingen over alle BWB-types
- 20 edge cases gevonden en opgelost:
  - <boek>, <deel>, <kop> structuren
  - <regeling-tekst>, <circulaire-tekst> containers
  - <bijlage>, <enig-artikel>, <sub-paragraaf>, <divisie>
  - CALS <table> → Markdown tabellen
  - <nadruk>, <sup>, <sub> inline formatting
  - <redactie>, <tussenkop>, <gereserveerd>, <vervallen>
- Nix flake devshell met alle dependencies
- CLI entrypoint (wetgit)
- Domain models (Regeling, Artikel)

Sluit #4, sluit #5
2026-03-29 21:24:32 +02:00

59 lines
2.2 KiB
Python

"""Tests voor de BWB XML parser."""
import pytest
from wetgit.pipeline.bwb_parser import parse_bwb_xml
@pytest.fixture
def grondwet_xml(tmp_path):
    """Download the Grondwet XML (BWBR0001840) for tests.

    The document is fetched over HTTP with a 30 second timeout and stored
    as ``grondwet.xml`` inside the ``tmp_path`` directory that pytest
    supplies to this fixture.

    Returns:
        The path of the written XML file, as a string.

    Raises:
        Whatever ``raise_for_status()`` raises when the server answers with
        an error status; the file is not written in that case.
    """
    # httpx is imported inside the fixture rather than at module level.
    import httpx

    target = tmp_path / "grondwet.xml"
    response = httpx.get(
        "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml",
        timeout=30,
    )
    # Fail here on an HTTP error instead of saving an error page as XML.
    response.raise_for_status()
    target.write_bytes(response.content)
    return str(target)
@pytest.mark.integration
class TestBWBParser:
    """Integration tests running ``parse_bwb_xml`` on the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        """The parsed result exposes the expected id, title and kind."""
        parsed = parse_bwb_xml(grondwet_xml)

        assert parsed.bwb_id == "BWBR0001840"
        assert parsed.titel == "Grondwet"
        assert parsed.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        """The frontmatter mapping carries id, type, status and a source link."""
        parsed = parse_bwb_xml(grondwet_xml)

        assert parsed.frontmatter["bwb_id"] == "BWBR0001840"
        assert parsed.frontmatter["type"] == "wet"
        assert parsed.frontmatter["status"] == "geldend"
        # The source value is stringified before the substring check.
        assert "wetten.overheid.nl" in str(parsed.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        """The Markdown contains the Artikel 1 heading and a phrase from its text."""
        markdown = parse_bwb_xml(grondwet_xml).markdown

        assert "### Artikel 1" in markdown
        assert "gelijke gevallen gelijk behandeld" in markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        """The Markdown contains level-2 headings for Hoofdstuk 1 and 2."""
        markdown = parse_bwb_xml(grondwet_xml).markdown

        assert "## Hoofdstuk 1" in markdown
        assert "## Hoofdstuk 2" in markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        """The Markdown opens with a ``---`` line and has a ``---`` line later on."""
        markdown = parse_bwb_xml(grondwet_xml).markdown

        assert markdown.startswith("---\n")
        assert "\n---\n" in markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        """The Markdown starts with frontmatter and has exactly one h1, the title."""
        markdown_lines = parse_bwb_xml(grondwet_xml).markdown.split("\n")

        # Should start with frontmatter then h1
        assert markdown_lines[0] == "---"

        # Collect every level-1 heading; only the document title is expected.
        top_level_headings = [
            line for line in markdown_lines if line.startswith("# ")
        ]
        assert top_level_headings == ["# Grondwet"]