feat: project scaffold + BWB XML parser
- pyproject.toml met wetgit package, pytest/ruff/black/mypy config
- BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py)
- Getest op ~400 regelingen over alle BWB-types
- 20 edge cases gevonden en opgelost:
  - <boek>, <deel>, <kop> structuren
  - <regeling-tekst>, <circulaire-tekst> containers
  - <bijlage>, <enig-artikel>, <sub-paragraaf>, <divisie>
  - CALS <table> → Markdown tabellen
  - <nadruk>, <sup>, <sub> inline formatting
  - <redactie>, <tussenkop>, <gereserveerd>, <vervallen>
- Nix flake devshell met alle dependencies
- CLI entrypoint (wetgit)
- Domain models (Regeling, Artikel)

Sluit #4, sluit #5
This commit is contained in:
parent
bed91e891e
commit
1dc93b0f89
15 changed files with 936 additions and 0 deletions
10
.env.example
Normal file
10
.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# WetGIT Environment Variables
|
||||||
|
# Copy to .env and fill in real values:
|
||||||
|
# cp .env.example .env
|
||||||
|
|
||||||
|
# AgentMail API (coornhert@wetgit.nl)
|
||||||
|
# Get your key from https://console.agentmail.to
|
||||||
|
AGENTMAIL_API_KEY=
|
||||||
|
|
||||||
|
# Hetzner Cloud
|
||||||
|
HCLOUD_TOKEN=
|
||||||
33
.gitignore
vendored
Normal file
33
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
# Secrets
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
ansible/.vault_pass
|
||||||
|
|
||||||
|
# Nix / direnv
|
||||||
|
.direnv/
|
||||||
|
result
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.venv/
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1731533236,
|
||||||
|
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1774610258,
|
||||||
|
"narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixpkgs-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
116
flake.nix
Normal file
116
flake.nix
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
{
  description = "WetGit - Nederlandse wetgeving als code";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };

  # Exposes one dev shell per default system; no packages or apps are defined here.
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = nixpkgs.legacyPackages.${system};

        # Python 3.13 (same version ansible uses; avoids PATH conflicts)
        pythonEnv = pkgs.python313.withPackages (ps: with ps; [
          # Conversion pipeline (PRD: Technical Stack)
          lxml # BWB XML parsing with XPath/XSLT
          pygit2 # Git operations via libgit2 (more performant than GitPython)
          pyyaml # YAML frontmatter generation
          python-frontmatter # Markdown + YAML frontmatter parsing

          # API layer (PRD: FastAPI)
          fastapi
          uvicorn # ASGI server
          httpx # Async HTTP client (SRU API, EUR-Lex)
          pydantic # Data validation

          # Background tasks (PRD: Celery + Redis)
          celery
          redis # Python Redis client

          # CLI tool (PRD: wetgit CLI)
          click
          rich # Terminal formatting

          # Testing
          pytest
          pytest-cov
          pytest-asyncio

          # Development tools
          black
          ruff
          mypy
          pip
          setuptools
          wheel
          build

          # Typing stubs
          types-requests
          types-pyyaml
        ]);

      in {
        devShells.default = pkgs.mkShell {
          name = "wetgit";

          buildInputs = with pkgs; [
            # Python environment
            pythonEnv

            # Dependency management
            uv

            # Ansible (infrastructure provisioning on Hetzner)
            ansible
            ansible-lint

            # Hetzner Cloud CLI
            hcloud

            # Redis server (local development)
            redis

            # Git & tools
            git
            jq
            yq-go
            curl

            # Native dependencies for pygit2
            libgit2
          ];

          # Runs on shell entry: prints tool versions, exports variables from
          # .env when that file exists, and creates/activates a uv-managed .venv
          # for packages installed from PyPI (agentmail).
          # The `#` lines inside the string are shell comments, i.e. part of the
          # string itself, and are therefore left exactly as written.
          shellHook = ''
            echo "WetGit - Nederlandse wetgeving als code"
            echo ""
            echo "Python: $(python --version)"
            echo "Ansible: $(ansible --version 2>/dev/null | head -1)"
            echo "hcloud: $(hcloud version 2>/dev/null)"
            echo ""
            echo "Pipeline tools: lxml, pygit2, fastapi"
            echo "Infra tools: ansible, hcloud"
            echo ""

            # Laad .env als die bestaat (API keys, Hetzner token)
            if [ -f .env ]; then
              set -a
              source .env
              set +a
              echo "Loaded environment from .env"
              echo ""
            fi

            # Venv voor PyPI-only packages (agentmail etc.)
            if [ ! -d .venv ]; then
              uv venv .venv --python python3.13 --seed
              uv pip install --python .venv/bin/python agentmail
              echo "Created .venv and installed PyPI dependencies"
            fi
            source .venv/bin/activate
          '';
        };
      });
}
|
||||||
92
pyproject.toml
Normal file
92
pyproject.toml
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
[project]
name = "wetgit"
version = "0.1.0"
description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit"
readme = "README.md"
# SPDX license expression (PEP 639). A "License ::" trove classifier must not be
# listed alongside it: setuptools releases with PEP 639 support reject that
# combination, so the license is declared here only.
license = "MIT"
requires-python = ">=3.12"
authors = [
    { name = "Coornhert", email = "coornhert@wetgit.nl" },
]
keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"]
classifiers = [
    "Development Status :: 2 - Pre-Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Legal Industry",
    "Programming Language :: Python :: 3.13",
    "Topic :: Text Processing :: Markup",
]
dependencies = [
    "lxml>=5.0",
    "pygit2>=1.13",
    "pyyaml>=6.0",
    "python-frontmatter>=1.1",
    "httpx>=0.27",
    "click>=8.1",
    "rich>=13.0",
    "pydantic>=2.0",
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
api = [
|
||||||
|
"fastapi>=0.115",
|
||||||
|
"uvicorn>=0.30",
|
||||||
|
"celery>=5.4",
|
||||||
|
"redis>=5.0",
|
||||||
|
]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0",
|
||||||
|
"pytest-cov>=5.0",
|
||||||
|
"pytest-asyncio>=0.24",
|
||||||
|
"black>=24.0",
|
||||||
|
"ruff>=0.6",
|
||||||
|
"mypy>=1.11",
|
||||||
|
"types-pyyaml",
|
||||||
|
"types-requests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
wetgit = "wetgit.cli.main:cli"
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://wetgit.nl"
|
||||||
|
Repository = "https://git.wetgit.nl/wetgit/meta"
|
||||||
|
Issues = "https://git.wetgit.nl/wetgit/meta/issues"
|
||||||
|
|
||||||
|
[build-system]
# 77.0 is the first setuptools release that accepts `project.license` as an
# SPDX expression string (PEP 639); older releases only accept the table form.
requires = ["setuptools>=77.0"]
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
markers = [
|
||||||
|
"unit: Unit tests (fast, no I/O)",
|
||||||
|
"integration: Integration tests (may need network/disk)",
|
||||||
|
"slow: Slow tests (large XML parsing, bulk operations)",
|
||||||
|
]
|
||||||
|
asyncio_mode = "auto"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 99
|
||||||
|
target-version = ["py313"]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 99
|
||||||
|
target-version = "py313"
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"]
|
||||||
|
ignore = ["E501"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
known-first-party = ["wetgit"]
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.13"
|
||||||
|
warn_return_any = true
|
||||||
|
warn_unused_configs = true
|
||||||
|
disallow_untyped_defs = true
|
||||||
3
src/wetgit/__init__.py
Normal file
3
src/wetgit/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
"""WetGit — Nederlandse wetgeving als code."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
1
src/wetgit/api/__init__.py
Normal file
1
src/wetgit/api/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""FastAPI REST API."""
|
||||||
1
src/wetgit/cli/__init__.py
Normal file
1
src/wetgit/cli/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""WetGit CLI tool."""
|
||||||
17
src/wetgit/cli/main.py
Normal file
17
src/wetgit/cli/main.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
"""WetGit CLI — command-line interface."""
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from wetgit import __version__
|
||||||
|
|
||||||
|
|
||||||
|
# Root command group of the `wetgit` CLI; subcommands attach via `@cli.command()`.
# `--version` reports the package's `__version__`.
# The docstring doubles as the help text click displays, so it is left as written.
@click.group()
@click.version_option(version=__version__, prog_name="wetgit")
def cli() -> None:
    """WetGit — Nederlandse wetgeving als code."""
|
||||||
|
|
||||||
|
|
||||||
|
# `wetgit version` subcommand: echoes "wetgit <version>".
# The docstring doubles as the help text click displays, so it is left as written.
@cli.command()
def version() -> None:
    """Toon de WetGit versie."""
    click.echo(f"wetgit {__version__}")
|
||||||
55
src/wetgit/models.py
Normal file
55
src/wetgit/models.py
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
"""Domain models voor WetGit."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import date
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification."""

    # The enum mixes in `str`, so every member compares equal to its plain
    # string value (e.g. RegelingType.WET == "wet").
    WET = "wet"
    AMVB = "amvb"
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"
|
||||||
|
|
||||||
|
|
||||||
|
class RegelingStatus(str, Enum):
    """Status of a regulation."""

    # "geldend" is Dutch for "in force"; "vervallen" for "lapsed".
    GELDEND = "geldend"
    VERVALLEN = "vervallen"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Regeling:
    """Immutable metadata record for a single regulation.

    Instances are frozen and hashable, so they can be used as dict keys or
    set members. Only the first five fields are required.
    """

    bwb_id: str
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date
    datum_laatste_wijziging: date | None = None
    datum_verval: date | None = None
    citeertitel: str | None = None
    ministerie: str | None = None
    bron_url: str | None = None
    # hash=False: a frozen dataclass generates __hash__ from its fields, and a
    # list is unhashable, so including this field made hash(Regeling(...))
    # raise TypeError. The field still takes part in equality comparison.
    eu_implementatie: list[dict[str, str]] = field(default_factory=list, hash=False)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class Artikel:
|
||||||
|
"""Een artikel binnen een regeling."""
|
||||||
|
|
||||||
|
nummer: str
|
||||||
|
titel: str | None
|
||||||
|
inhoud: str
|
||||||
|
leden: list[str] = field(default_factory=list)
|
||||||
1
src/wetgit/pipeline/__init__.py
Normal file
1
src/wetgit/pipeline/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""BWB/CVDR/EUR-Lex conversie-pipeline."""
|
||||||
487
src/wetgit/pipeline/bwb_parser.py
Normal file
487
src/wetgit/pipeline/bwb_parser.py
Normal file
|
|
@ -0,0 +1,487 @@
|
||||||
|
"""BWB XML naar Markdown parser.
|
||||||
|
|
||||||
|
Parseert BWB toestand-XML (schema versie 2.0) naar Markdown + YAML frontmatter.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import date
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParsedRegeling:
    """Result of parsing a BWB toestand XML document."""

    # Identifier as found by _extract_bwb_id ("ONBEKEND" when none was found).
    bwb_id: str
    # Display title: citeertitel, else intitule, else the BWB id.
    titel: str
    citeertitel: str | None
    # Value of the `soort` attribute on <wetgeving> ("onbekend" when absent).
    soort: str
    # Raw `inwerkingtredingsdatum` attribute value, not parsed into a date.
    datum_inwerkingtreding: str | None
    # Complete document: YAML frontmatter, H1 title and body.
    markdown: str
    # The same mapping that is serialised into the frontmatter of `markdown`.
    frontmatter: dict[str, str | list[str] | None]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Parse a BWB toestand XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        A ParsedRegeling holding the extracted metadata, the frontmatter
        mapping and the complete Markdown document.

    Raises:
        ValueError: If the document contains no <wetgeving> element.
    """
    tree = etree.parse(xml_path)
    wetgeving = tree.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # Metadata
    soort = wetgeving.get("soort", "onbekend")
    bwb_id = _extract_bwb_id(tree)
    datum = wetgeving.get("inwerkingtredingsdatum")

    # Collapse runs of whitespace: the title ends up on a single "# ..."
    # heading line, which an embedded newline from the XML would break.
    intitule = " ".join(wetgeving.findtext(".//intitule", default="").split())
    citeertitel = " ".join(wetgeving.findtext(".//citeertitel", default="").split()) or None
    titel = citeertitel or intitule or bwb_id

    # The body lives in a different container depending on the document type;
    # the first path that matches wins.
    container_paths = (
        ".//wet-besluit//wettekst",  # formal acts: <wet-besluit><wettekst>
        ".//regeling-tekst",  # ministerial regulations
        ".//circulaire-tekst",  # circulars / ZBO, with <circulaire.divisie>
        ".//wettekst",  # fallback: <wettekst> anywhere
    )
    wettekst = None
    for path in container_paths:
        wettekst = wetgeving.find(path)
        if wettekst is not None:
            break

    md_parts: list[str] = []
    if wettekst is not None:
        md_parts = _parse_wettekst(wettekst)

    # Annexes are looked up separately, as they sit outside the body container.
    for bijlage in wetgeving.findall(".//bijlage"):
        bijlage_md = _parse_bijlage(bijlage)
        if bijlage_md:
            md_parts.append(bijlage_md)

    # Frontmatter; "status" is always written as "geldend" by this parser.
    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
    }
    if citeertitel:
        frontmatter["citeertitel"] = citeertitel

    # Assemble the document: frontmatter, H1 title, body blocks.
    fm_yaml = yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False)
    markdown = f"---\n{fm_yaml.strip()}\n---\n\n# {titel}\n\n"
    markdown += "\n\n".join(md_parts)
    markdown += "\n"

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_bwb_id(tree: etree._ElementTree) -> str:
    """Return the BWB identifier of the document, or "ONBEKEND" if none is found.

    Lookup order:
      1. the ``bwb-id`` attribute of the first <toestand> element carrying one;
      2. the first ``bwb-id`` attribute found on a <bwb-inputbestand> element
         or any of its descendants.

    The ``stam-id`` attribute of <wetgeving> is deliberately not consulted:
    it is not the BWB id.
    """
    for toestand in tree.iter("toestand"):
        bwb_id = toestand.get("bwb-id")
        if bwb_id:
            return bwb_id

    # Fallback: search the <bwb-inputbestand> subtree.
    inp = tree.find(".//bwb-inputbestand")
    if inp is not None:
        for node in inp.iter():
            bwb_id = node.get("bwb-id")
            if bwb_id:
                return bwb_id
    return "ONBEKEND"
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_wettekst(wettekst: etree._Element) -> list[str]:
    """Convert the direct children of a body container into Markdown blocks.

    Children with an unrecognised tag are ignored; blocks that are empty or
    whitespace-only are dropped from the result.
    """
    # Structural tag -> (heading level, heading label).
    structure_tags = {
        "deel": (2, "Deel"),
        "boek": (2, "Boek"),
        "hoofdstuk": (2, "Hoofdstuk"),
        "titeldeel": (2, "Titel"),
        "afdeling": (3, "Afdeling"),
        "paragraaf": (3, "Paragraaf"),
        "circulaire.divisie": (2, ""),
        "sub-paragraaf": (3, "Paragraaf"),
        "divisie": (2, ""),
    }
    # Tags that are not converted but represented by a fixed marker.
    placeholders = {
        "tabel": "*[tabel]*",
        "plaatje": "*[afbeelding]*",
        "illustratie": "*[afbeelding]*",
    }

    blocks: list[str] = []
    for node in wettekst:
        name = node.tag
        if name in structure_tags:
            depth, caption = structure_tags[name]
            blocks.append(_parse_structuur(node, level=depth, label=caption))
        elif name in ("artikel", "enig-artikel"):
            blocks.append(_parse_artikel(node))
        elif name == "tekst":
            blocks.append(_parse_tekst_content(node))
        elif name == "al":
            blocks.append(_get_text(node))
        elif name == "table":
            blocks.append(_parse_cals_table(node))
        elif name in placeholders:
            blocks.append(placeholders[name])

    return [block for block in blocks if block.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_structuur(elem: etree._Element, level: int, label: str) -> str:
    """Render a structural element (chapter, section, paragraph, book, ...).

    Args:
        elem: The structural element.
        level: Markdown heading level for this element; nested structural
            children are rendered one level deeper. Rendered headings are
            capped at level 6, the deepest Markdown heading.
        label: Word placed before the number in the heading (may be empty).

    Returns:
        The heading followed by the rendered children, separated by blank
        lines. Children with an unrecognised tag are ignored.
    """
    nr = ""
    titel = ""

    # A <kop> child carries nr and titel for books and some other structures.
    kop = elem.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # Fallback: nr / title given as direct children.
    if not nr:
        nr = elem.findtext("./nr", default="").strip()
    if not titel:
        titel = (
            elem.findtext("./hoofdstuktitel", default="")
            or elem.findtext("./titeldeel-titel", default="")
            or elem.findtext("./afdelingtitel", default="")
            or elem.findtext("./paragraaftitel", default="")
            or elem.findtext("./boektitel", default="")
            or elem.findtext("./titel", default="")
        ).strip()

    # Build the heading from the pieces that are actually present, so an empty
    # label or number leaves no stray spaces or a dangling ". " behind.
    prefix = " ".join(piece for piece in (label, nr) if piece)
    if prefix and titel:
        caption = f"{prefix}. {titel}"
    else:
        caption = prefix or titel
    header = f"{'#' * min(level, 6)} {caption}".rstrip()

    # Nested structural tag -> heading label.
    structure_labels = {
        "deel": "Deel",
        "boek": "Boek",
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
        "circulaire.divisie": "",
        "sub-paragraaf": "Paragraaf",
        "divisie": "",
    }
    # Already consumed for the heading above.
    skip_tags = {
        "nr", "kop", "titel",
        "hoofdstuktitel", "titeldeel-titel", "afdelingtitel",
        "paragraaftitel", "boektitel",
    }

    parts = [header]

    for child in elem:
        tag = child.tag
        if tag in skip_tags:
            continue
        if tag in structure_labels:
            parts.append(_parse_structuur(child, level=level + 1, label=structure_labels[tag]))
        elif tag in ("artikel", "enig-artikel"):
            parts.append(_parse_artikel(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        elif tag == "tussenkop":
            text = _get_text(child).strip()
            if text:
                sub_heading = "#" * min(level + 1, 6)
                parts.append(f"{sub_heading} {text}")
        elif tag == "redactie":
            text = _get_text(child).strip()
            if text:
                parts.append(f"*[{text}]*")
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")

    return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_artikel(artikel: etree._Element) -> str:
    """Render an <artikel> (or <enig-artikel>) element as Markdown.

    The heading is always level 3: "### Artikel <nr>", or "### Artikel" when
    no number is found anywhere below the element. Children with an
    unrecognised tag (including <kop>) are ignored.
    """
    nr = artikel.findtext(".//nr", default="").strip()
    heading = f"### Artikel {nr}" if nr else "### Artikel"

    parts = [heading]

    for child in artikel:
        tag = child.tag
        if tag in ("nr", "lidnr"):
            continue
        elif tag == "titel":
            titel_text = _get_text(child).strip()
            if titel_text:
                parts.append(f"*{titel_text}*")
        elif tag == "lid":
            parts.append(_parse_lid(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "lijst":
            parts.append(_parse_lijst(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        # table / images / redactie: handled the same way as in the sibling
        # parsers, so such content directly under an article is not dropped.
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag == "redactie":
            text = _get_text(child).strip()
            if text:
                parts.append(f"*[{text}]*")
        elif tag == "gereserveerd":
            parts.append("*[Dit artikel is gereserveerd.]*")
        elif tag == "vervallen":
            parts.append("*[Dit artikel is vervallen.]*")
        elif tag == "lid-vervallen":
            lidnr = child.findtext(".//lidnr", default="").strip()
            parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*")

    return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_lid(lid: etree._Element) -> str:
    """Render a <lid> element (a numbered paragraph of an article).

    The number, when present, is emitted in bold as the first chunk. A
    numbered lid with at most one content chunk is returned on a single line;
    otherwise the chunks are separated by blank lines.
    """
    number = lid.findtext(".//lidnr", default="").strip()
    chunks: list[str] = [f"**{number}.**"] if number else []

    for node in lid:
        kind = node.tag
        if kind == "al":
            chunks.append(_get_text(node))
        elif kind == "lijst":
            chunks.append(_parse_lijst(node))
        elif kind == "tekst":
            chunks.append(_parse_tekst_content(node))
        elif kind == "table":
            chunks.append(_parse_cals_table(node))
        elif kind == "tabel":
            chunks.append("*[tabel]*")
        elif kind == "formule":
            chunks.append(f"*[formule: {_get_text(node)}]*")
        elif kind == "redactie":
            note = _get_text(node).strip()
            if note:
                chunks.append(f"*[{note}]*")
        # <lidnr> and any other tag: nothing to emit.

    separator = " " if number and len(chunks) <= 2 else "\n\n"
    return separator.join(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_lijst(lijst: etree._Element) -> str:
    """Render a <lijst> element as a Markdown list, one line per item.

    Only the direct <li> children are iterated. A nested <lijst> is rendered
    as part of the item that contains it (via _parse_tekst_content), so
    walking all descendant <li> elements here would emit nested items twice.

    Each item is prefixed with its own <li.nr> text, or "- " when it has none.
    """
    items: list[str] = []
    for li in lijst.findall("./li"):
        nr = li.findtext("./li.nr", default="").strip()
        body = li.find("./li.body")
        # Without an <li.body> wrapper, render the item's own block children
        # rather than all of its text: the latter would include the <li.nr>
        # text a second time, after the prefix.
        # NOTE(review): an item holding only bare text next to <li.nr> still
        # falls back to the full text inside _parse_tekst_content.
        text = _parse_tekst_content(body if body is not None else li)
        prefix = f"{nr} " if nr else "- "
        items.append(f"{prefix}{text}")
    return "\n".join(items)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tekst_content(elem: etree._Element) -> str:
    """Render the mixed block content (al, lijst, table, ...) of an element.

    Recognised children are rendered and joined with blank lines. When no
    child is recognised at all, the element's own inline text is returned.
    """
    # Tags represented by a fixed marker instead of being converted.
    markers = {
        "tabel": "*[tabel]*",
        "plaatje": "*[afbeelding]*",
        "illustratie": "*[afbeelding]*",
    }

    blocks: list[str] = []
    for node in elem:
        kind = node.tag
        if kind == "al":
            blocks.append(_get_text(node))
        elif kind == "lijst":
            blocks.append(_parse_lijst(node))
        elif kind == "table":
            blocks.append(_parse_cals_table(node))
        elif kind in markers:
            blocks.append(markers[kind])
        elif kind == "redactie":
            note = _get_text(node).strip()
            if note:
                blocks.append(f"*[{note}]*")

    if blocks:
        return "\n\n".join(blocks)
    # Nothing recognised: fall back to the plain text of the element itself.
    return _get_text(elem)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cals_table(table: etree._Element) -> str:
    """Render a CALS <table> element as a Markdown table.

    Rows of the first <thead> come first, followed by the rows of every
    <tbody>. When the table has no <tbody> at all, every row that is not a
    header row is used as a body row. Markdown requires a header, so the first
    row overall is always followed by the separator line. Column and row
    spans are not interpreted; short rows are padded with empty cells.

    Returns:
        The Markdown table, or the marker "*[tabel]*" when the table has no
        rows or no cells.
    """

    def cells_of(row: etree._Element) -> list[str]:
        return [_get_text(entry) for entry in row.findall(".//entry")]

    thead = table.find(".//thead")
    header_rows = thead.findall(".//row") if thead is not None else []
    header_ids = {id(row) for row in header_rows}

    bodies = table.findall(".//tbody")
    if bodies:
        candidates = [row for tbody in bodies for row in tbody.findall(".//row")]
    else:
        # No tbody: rows sit directly under tgroup. Header rows were already
        # collected above and must not be added a second time.
        candidates = [row for row in table.findall(".//row") if id(row) not in header_ids]

    # Keep each row once, in first-seen order (a tbody nested inside another
    # tbody would otherwise contribute its rows twice).
    body_rows = list({id(row): row for row in candidates}.values())

    rows: list[list[str]] = [cells_of(row) for row in header_rows]
    rows.extend(cells_of(row) for row in body_rows)

    if not rows:
        return "*[tabel]*"

    max_cols = max(len(r) for r in rows)
    if max_cols == 0:
        # Rows without a single <entry> cannot form a Markdown table.
        return "*[tabel]*"

    # Normalise the column count.
    for r in rows:
        r.extend([""] * (max_cols - len(r)))

    lines: list[str] = []
    for i, row in enumerate(rows):
        # A literal pipe or newline inside a cell would break the table row.
        escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in row]
        lines.append("| " + " | ".join(escaped) + " |")
        if i == 0:
            lines.append("| " + " | ".join("---" for _ in escaped) + " |")

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_bijlage(bijlage: etree._Element) -> str:
    """Render a <bijlage> (annex) element as Markdown.

    The heading is level 2: "## Bijlage <nr>. <titel>", built only from the
    pieces present in the annex's <kop>. Children with an unrecognised tag are
    ignored.
    """
    # Number and title come from <kop>.
    kop = bijlage.find("./kop")
    nr = ""
    titel = ""
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # Leave no trailing space or " . " behind when the number is missing.
    header = f"## Bijlage {nr}".rstrip()
    if titel:
        header += f". {titel}"

    # Heading labels matching the ones used for the main body
    # (a <titeldeel> is labelled "Titel" there as well).
    structure_labels = {
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
    }

    parts = [header]

    for child in bijlage:
        tag = child.tag
        if tag == "kop":
            continue
        elif tag == "artikel":
            parts.append(_parse_artikel(child))
        elif tag in ("tekst", "bijlage-tekst"):
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag in structure_labels:
            parts.append(_parse_structuur(child, level=3, label=structure_labels[tag]))
        elif tag == "al":
            parts.append(_get_text(child))

    return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_text(elem: etree._Element) -> str:
    """Return the full text of an element with inline Markdown formatting.

    Inline children such as <nadruk>, <sup>, <sub>/<inf>, <extref> and
    <intref> are handled by _collect_text; surrounding whitespace is stripped
    from the result.
    """
    fragments: list[str] = []
    _collect_text(elem, fragments)
    joined = "".join(fragments)
    return joined.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_text(elem: etree._Element, parts: list[str]) -> None:
    """Recursively collect the text of *elem* into *parts*, formatting inline markup.

    Args:
        elem: Element whose text and inline children are collected.
        parts: Output list; text fragments are appended in document order.

    Formatting applied:
        <nadruk type="vet"> -> **bold**, <nadruk type="cur"> -> *italic*,
        <sup> -> ^text, <sub>/<inf> -> _text. Any other element contributes
        its text unchanged (link targets of <extref>/<intref> are lost).
    """
    if elem.text:
        parts.append(elem.text)

    for child in elem:
        tag = child.tag
        if not isinstance(tag, str):
            # Comments and processing instructions have a non-string tag.
            # Their own text is not document content and must not end up in
            # the Markdown; only the tail that follows them is kept (below).
            pass
        elif tag == "nadruk":
            nadruk_type = child.get("type", "")
            inner = "".join(child.itertext())
            if nadruk_type == "vet":
                parts.append(f"**{inner}**")
            elif nadruk_type == "cur":
                parts.append(f"*{inner}*")
            else:
                parts.append(inner)
        elif tag == "sup":
            inner = "".join(child.itertext())
            parts.append(f"^{inner}")
        elif tag in ("sub", "inf"):
            inner = "".join(child.itertext())
            parts.append(f"_{inner}")
        elif tag == "nootref":
            # Footnote reference: keep its text as-is.
            parts.append("".join(child.itertext()))
        else:
            # <extref>, <intref> and unknown inline elements: keep the text,
            # recursing so that nested formatting is still applied.
            _collect_text(child, parts)

        # Text following the child belongs to the parent's content.
        if child.tail:
            parts.append(child.tail)
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
"""Tests voor de BWB XML parser."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory: pytest.TempPathFactory) -> str:
    """Download the Grondwet XML and return the path of the local copy.

    Module-scoped so the file is fetched once for all tests in this module
    instead of once per test. Requires network access; raises if the HTTP
    request fails.
    """
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that parse the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.bwb_id == "BWBR0001840"
        assert result.titel == "Grondwet"
        assert result.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.frontmatter["bwb_id"] == "BWBR0001840"
        assert result.frontmatter["type"] == "wet"
        assert result.frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(result.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "### Artikel 1" in result.markdown
        assert "gelijke gevallen gelijk behandeld" in result.markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "## Hoofdstuk 1" in result.markdown
        assert "## Hoofdstuk 2" in result.markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.markdown.startswith("---\n")
        assert "\n---\n" in result.markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        # The document starts with the frontmatter fence ...
        lines = result.markdown.split("\n")
        assert lines[0] == "---"
        # ... and contains exactly one H1, the title.
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"
|
||||||
Loading…
Add table
Reference in a new issue