- pyproject.toml met wetgit package, pytest/ruff/black/mypy config
- BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py)
- Getest op ~400 regelingen over alle BWB-types
- 20 edge cases gevonden en opgelost:
  - <boek>, <deel>, <kop> structuren
  - <regeling-tekst>, <circulaire-tekst> containers
  - <bijlage>, <enig-artikel>, <sub-paragraaf>, <divisie>
  - CALS <table> → Markdown tabellen
  - <nadruk>, <sup>, <sub> inline formatting
  - <redactie>, <tussenkop>, <gereserveerd>, <vervallen>
- Nix flake devshell met alle dependencies
- CLI entrypoint (wetgit)
- Domain models (Regeling, Artikel)

Sluit #4, sluit #5
59 lines
2.2 KiB
Python
59 lines
2.2 KiB
Python
"""Tests voor de BWB XML parser."""
|
|
|
|
import pytest
|
|
|
|
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
|
|
|
|
|
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory):
    """Download the Grondwet (BWBR0001840) XML once and return its local path.

    The fixture is module-scoped so the network download happens a single
    time for every test in this module instead of once per test; the tests
    only read the file, so sharing it is safe.

    Returns:
        The path of the downloaded XML file, as a string.

    Raises:
        httpx.HTTPStatusError: If the repository answers with an error status.
    """
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()
    # tmp_path is function-scoped, so a wider-scoped fixture has to create
    # its directory through tmp_path_factory instead.
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
|
|
|
|
|
|
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that run ``parse_bwb_xml`` on the Grondwet XML file."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        """The result exposes the expected ``bwb_id``, ``titel`` and ``soort``."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.bwb_id == "BWBR0001840"
        assert result.titel == "Grondwet"
        assert result.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        """The ``frontmatter`` mapping holds the id, type, status and source."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.frontmatter["bwb_id"] == "BWBR0001840"
        assert result.frontmatter["type"] == "wet"
        assert result.frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(result.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        """The Markdown contains the Artikel 1 heading and a phrase from its text."""
        result = parse_bwb_xml(grondwet_xml)
        assert "### Artikel 1" in result.markdown
        assert "gelijke gevallen gelijk behandeld" in result.markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        """The Markdown contains level-2 headings for Hoofdstuk 1 and 2."""
        result = parse_bwb_xml(grondwet_xml)
        assert "## Hoofdstuk 1" in result.markdown
        assert "## Hoofdstuk 2" in result.markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        """The Markdown opens with a ``---`` line and has a later ``---`` line."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.markdown.startswith("---\n")
        assert "\n---\n" in result.markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        """The Markdown starts with ``---`` and has exactly one h1, the title."""
        result = parse_bwb_xml(grondwet_xml)
        # Should start with frontmatter then h1
        lines = result.markdown.split("\n")
        assert lines[0] == "---"
        # Find the h1; "line" instead of the ambiguous single-letter "l"
        # (PEP 8 / ruff E741).
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"
|