Compare commits

...

2 commits

Author SHA1 Message Date
c481ebf9e7 feat: ansible deployment setup voor dt-prod-01
- Forgejo + Redis Docker stack (wetgit-forgejo role)
- FastAPI + Celery systemd services (wetgit-app role)
- Nginx vhosts voor git.wetgit.nl en api.wetgit.nl (wetgit-nginx role)
- SSL via Let's Encrypt (certbot webroot)
- Backup script (forgejo dump, geen downtime)
- Codeberg mirror script
- Cron jobs voor backup/mirror/log cleanup
- Ansible vault voor secrets (encrypted)

Geïsoleerd van dt-platform: eigen poorten, users, directories.
2026-03-29 21:24:47 +02:00
1dc93b0f89 feat: project scaffold + BWB XML parser
- pyproject.toml met wetgit package, pytest/ruff/black/mypy config
- BWB XML → Markdown parser (src/wetgit/pipeline/bwb_parser.py)
- Getest op ~400 regelingen over alle BWB-types
- 20 edge cases gevonden en opgelost:
  - <boek>, <deel>, <kop> structuren
  - <regeling-tekst>, <circulaire-tekst> containers
  - <bijlage>, <enig-artikel>, <sub-paragraaf>, <divisie>
  - CALS <table> → Markdown tabellen
  - <nadruk>, <sup>, <sub> inline formatting
  - <redactie>, <tussenkop>, <gereserveerd>, <vervallen>
- Nix flake devshell met alle dependencies
- CLI entrypoint (wetgit)
- Domain models (Regeling, Artikel)

Sluit #4, sluit #5
2026-03-29 21:24:32 +02:00
35 changed files with 1769 additions and 0 deletions

10
.env.example Normal file
View file

@ -0,0 +1,10 @@
# WetGIT Environment Variables
# Copy to .env and fill in real values:
# cp .env.example .env
# AgentMail API (coornhert@wetgit.nl)
# Get your key from https://console.agentmail.to
AGENTMAIL_API_KEY=
# Hetzner Cloud
HCLOUD_TOKEN=

33
.gitignore vendored Normal file
View file

@ -0,0 +1,33 @@
# Secrets
.env
.env.*
!.env.example
ansible/.vault_pass
# Nix / direnv
.direnv/
result
# Python
__pycache__/
*.py[cod]
*.egg-info/
dist/
build/
.venv/
*.egg
# Testing
.coverage
htmlcov/
.pytest_cache/
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db

10
ansible/ansible.cfg Normal file
View file

@ -0,0 +1,10 @@
[defaults]
inventory = inventory/hosts
remote_tmp = /tmp/.ansible/tmp
host_key_checking = True
retry_files_enabled = False
roles_path = roles
vault_password_file = .vault_pass
[ssh_connection]
pipelining = True

View file

@ -0,0 +1,45 @@
# WetGIT - Nederlandse wetgeving als code
# Deployment variables for dt-prod-01
#
# IMPORTANT: This server is shared with dt-platform.
# Do NOT use ports 8001 (dt-chatbot), 8200 (grimoire).
# Do NOT modify /opt/dt-chatbot, /opt/dt-skills-portal, /opt/grimoire.
# Do NOT modify the global nginx.conf — only add vhost configs.
# --- Application ---
app_name: wetgit
app_dir: /opt/wetgit
data_dir: /data/wetgit
# FastAPI backend
backend_port: 8002
backend_workers: 1
backend_host: "127.0.0.1"
# --- Domains ---
server_name: "api.wetgit.nl"
forgejo_domain: "git.wetgit.nl"
# --- Forgejo ---
forgejo_port: 3000
forgejo_data_dir: /opt/wetgit/data
forgejo_admin_user: coornhert
forgejo_admin_email: coornhert@wetgit.nl
# --- Redis (Docker, shared network with Forgejo) ---
redis_port: 6379
redis_host: "127.0.0.1"
# --- Celery ---
celery_concurrency: 2
# --- Codeberg mirror ---
codeberg_api_token: "{{ vault_codeberg_api_token | default('') }}"
# --- AgentMail ---
agentmail_api_key: "{{ vault_agentmail_api_key }}"
# --- Secrets (from vault.yml) ---
# vault_agentmail_api_key
# vault_codeberg_api_token (add when Codeberg account is ready)
# vault_forgejo_admin_password (initial admin password)

View file

@ -0,0 +1,14 @@
$ANSIBLE_VAULT;1.1;AES256
35323237613730303463313335643433616238663932643630636530356461323433666435653436
3433343462343538333335343165353538613435613962650a656166366364393564353733343561
66643462313261643538653839393365643634376432373665653133383464313636633762366163
6562336332396535390a333062323534373963356439353336633964383832313431623934653739
37646339376338623536323336353931343039323263666265363763373266343533333236346635
37656436623764393037393138343536313666613439666535656631313031343061346130376136
64383164643466643162393537343265313632343432336238393030306164636434356463396434
34656334383731326131393061333138643435366534333965376666393535316334396662633561
61386636336438383563326565336635643663313934326333323939663637653531363261613733
38646631333739303737616630663337663265616462346637326539306338613866313762306662
38633066323936623233336631653836656531633839643739313966623065313931356630613134
39636539643065663963626437383637643932633164306337626330623466313737623532366631
6435

2
ansible/inventory/hosts Normal file
View file

@ -0,0 +1,2 @@
[wetgit]
dt-prod-01 ansible_host=100.98.29.89 ansible_user=deploy ansible_become=yes

View file

@ -0,0 +1,12 @@
---
- name: restart wetgit
systemd:
name: wetgit
state: restarted
daemon_reload: yes
- name: restart wetgit-celery
systemd:
name: wetgit-celery
state: restarted
daemon_reload: yes

View file

@ -0,0 +1,79 @@
---
# WetGIT FastAPI application + Celery worker
# Deploys to /opt/wetgit/backend with own venv and systemd services
#
# Directories are created by wetgit-forgejo role (runs first).
# This role only manages the FastAPI app and Celery worker.
#
# NOTE: Services are only enabled when application code exists.
# On first deploy (no code yet), this role is effectively a no-op.
- name: Check if application code exists
stat:
path: "{{ app_dir }}/backend/requirements.txt"
register: app_code
- name: Create Python venv
command: python3 -m venv {{ app_dir }}/backend/venv
args:
creates: "{{ app_dir }}/backend/venv/bin/python"
when: app_code.stat.exists
- name: Set venv ownership
file:
path: "{{ app_dir }}/backend/venv"
owner: www-data
group: www-data
recurse: yes
when: app_code.stat.exists
- name: Install Python dependencies
pip:
requirements: "{{ app_dir }}/backend/requirements.txt"
virtualenv: "{{ app_dir }}/backend/venv"
when: app_code.stat.exists
notify: restart wetgit
# The environment file and both unit files are only deployed once application
# code exists. Their handlers restart the services; without a venv the units
# (Restart=always) would crash-loop, and the first deploy would not be the
# no-op the role header promises.
- name: Deploy environment file
  template:
    src: wetgit.env.j2
    dest: "{{ app_dir }}/backend/.env"
    owner: www-data
    group: www-data
    mode: "0600"
  when: app_code.stat.exists
  # Both units load this file via EnvironmentFile, so both must be restarted
  # to pick up a change.
  notify:
    - restart wetgit
    - restart wetgit-celery

- name: Deploy WetGIT systemd service
  template:
    src: wetgit.service.j2
    dest: /etc/systemd/system/wetgit.service
    owner: root
    group: root
    mode: "0644"
  when: app_code.stat.exists
  notify: restart wetgit

- name: Deploy Celery worker systemd service
  template:
    src: wetgit-celery.service.j2
    dest: /etc/systemd/system/wetgit-celery.service
    owner: root
    group: root
    mode: "0644"
  when: app_code.stat.exists
  notify: restart wetgit-celery
# Only start services when app code is deployed
- name: Enable and start WetGIT service
systemd:
name: wetgit
enabled: yes
state: started
daemon_reload: yes
when: app_code.stat.exists
- name: Enable and start Celery worker
systemd:
name: wetgit-celery
enabled: yes
state: started
daemon_reload: yes
when: app_code.stat.exists

View file

@ -0,0 +1,17 @@
[Unit]
Description=WetGIT Celery Worker
After=network.target docker.service
Requires=docker.service
[Service]
Type=simple
User=www-data
Group=www-data
WorkingDirectory={{ app_dir }}/backend
EnvironmentFile={{ app_dir }}/backend/.env
ExecStart={{ app_dir }}/backend/venv/bin/celery -A tasks worker --loglevel=info --concurrency={{ celery_concurrency }}
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,19 @@
# WetGIT environment — managed by Ansible
# Do NOT edit manually on the server
# FastAPI
WETGIT_HOST={{ backend_host }}
WETGIT_PORT={{ backend_port }}
WETGIT_WORKERS={{ backend_workers }}
# Redis / Celery
REDIS_URL=redis://{{ redis_host }}:{{ redis_port }}/0
CELERY_BROKER_URL=redis://{{ redis_host }}:{{ redis_port }}/0
CELERY_RESULT_BACKEND=redis://{{ redis_host }}:{{ redis_port }}/1
# AgentMail
AGENTMAIL_API_KEY={{ agentmail_api_key }}
# Data
WETGIT_DATA_DIR={{ data_dir }}
WETGIT_GIT_REPOS_DIR={{ data_dir }}/git-repos

View file

@ -0,0 +1,17 @@
[Unit]
Description=WetGIT API - Nederlandse wetgeving als code
After=network.target docker.service
Wants=wetgit-celery.service
[Service]
Type=simple
User=www-data
Group=www-data
WorkingDirectory={{ app_dir }}/backend
EnvironmentFile={{ app_dir }}/backend/.env
ExecStart={{ app_dir }}/backend/venv/bin/uvicorn main:app --host {{ backend_host }} --port {{ backend_port }} --workers {{ backend_workers }}
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,5 @@
---
- name: restart forgejo
community.docker.docker_compose_v2:
project_src: "{{ app_dir }}/docker"
state: restarted

View file

@ -0,0 +1,148 @@
---
# WetGIT Forgejo (self-hosted Git) + Redis
#
# Deploys Forgejo and Redis as Docker containers.
# Forgejo serves git.wetgit.nl (HTTPS-only, no SSH — blocked by firewall).
# Redis provides Celery broker for the WetGIT pipeline.
#
# IMPORTANT: Does NOT touch dt-platform's Docker services (grimoire).
# All containers use the 'wetgit-network' Docker network.
# --- System user ---
- name: Create wetgit system user
user:
name: wetgit
system: yes
home: /opt/wetgit
shell: /bin/bash
create_home: no
- name: Get wetgit user UID
command: id -u wetgit
register: wetgit_uid_result
changed_when: false
check_mode: false
- name: Get wetgit user GID
command: id -g wetgit
register: wetgit_gid_result
changed_when: false
check_mode: false
- name: Store wetgit UID/GID as facts
set_fact:
wetgit_uid: "{{ wetgit_uid_result.stdout }}"
wetgit_gid: "{{ wetgit_gid_result.stdout }}"
# --- Directories ---
- name: Create WetGIT directories
file:
path: "{{ item.path }}"
state: directory
owner: "{{ item.owner }}"
group: "{{ item.group }}"
mode: "0755"
loop:
# Forgejo directories (owned by wetgit user)
- { path: "{{ app_dir }}/docker", owner: wetgit, group: wetgit }
- { path: "{{ forgejo_data_dir }}", owner: wetgit, group: wetgit }
- { path: "{{ forgejo_data_dir }}/gitea/conf", owner: wetgit, group: wetgit }
- { path: "{{ data_dir }}/redis", owner: wetgit, group: wetgit }
- { path: "{{ app_dir }}/scripts", owner: wetgit, group: wetgit }
- { path: "{{ app_dir }}/backups", owner: wetgit, group: wetgit }
- { path: "{{ app_dir }}/logs", owner: wetgit, group: wetgit }
- { path: "{{ app_dir }}/mirrors", owner: wetgit, group: wetgit }
# Application directories (base dirs owned by root; backend and git-repos owned by www-data for FastAPI/Celery)
- { path: "{{ app_dir }}", owner: root, group: root }
- { path: "{{ app_dir }}/backend", owner: www-data, group: www-data }
- { path: "{{ data_dir }}", owner: root, group: root }
- { path: "{{ data_dir }}/git-repos", owner: www-data, group: www-data }
# --- Forgejo config ---
- name: Deploy Forgejo app.ini (initial seed)
template:
src: app.ini.j2
dest: "{{ forgejo_data_dir }}/gitea/conf/app.ini"
owner: wetgit
group: wetgit
mode: "0644"
# Don't overwrite if Forgejo has already modified it
force: no
notify: restart forgejo
# --- Docker Compose ---
- name: Deploy Docker Compose stack
template:
src: docker-compose.yml.j2
dest: "{{ app_dir }}/docker/docker-compose.yml"
owner: wetgit
group: wetgit
mode: "0644"
notify: restart forgejo
- name: Start WetGIT Docker stack
community.docker.docker_compose_v2:
project_src: "{{ app_dir }}/docker"
state: present
# --- Backup script ---
- name: Deploy backup script
template:
src: backup.sh.j2
dest: "{{ app_dir }}/scripts/backup.sh"
owner: wetgit
group: wetgit
mode: "0755"
# --- Mirror script ---
- name: Deploy Codeberg mirror script
template:
src: mirror-to-codeberg.sh.j2
dest: "{{ app_dir }}/scripts/mirror-to-codeberg.sh"
owner: wetgit
group: wetgit
mode: "0755"
# Writes the Codeberg API token to a file readable only by the wetgit user.
# Skipped while no token is configured.
- name: Deploy Codeberg token
  copy:
    content: "{{ codeberg_api_token }}"
    dest: "{{ app_dir }}/.codeberg-token"
    owner: wetgit
    group: wetgit
    mode: "0600"
  when: codeberg_api_token is defined and codeberg_api_token | length > 0
  # The file content is a secret: keep it out of task output, diffs and logs.
  no_log: true
# --- Cron jobs ---
- name: Configure backup cron (weekly Sunday 02:00)
cron:
name: "wetgit-backup"
user: root
weekday: "0"
hour: "2"
minute: "0"
job: "{{ app_dir }}/scripts/backup.sh >> {{ app_dir }}/logs/backup.log 2>&1"
- name: Configure Codeberg mirror cron (daily 04:00)
cron:
name: "wetgit-codeberg-mirror"
user: wetgit
hour: "4"
minute: "0"
job: "{{ app_dir }}/scripts/mirror-to-codeberg.sh >> {{ app_dir }}/logs/mirror.log 2>&1"
when: codeberg_api_token is defined and codeberg_api_token | length > 0
- name: Configure log cleanup cron (monthly)
cron:
name: "wetgit-log-cleanup"
user: wetgit
day: "1"
hour: "5"
minute: "0"
job: "find {{ app_dir }}/logs -name '*.log' -mtime +30 -delete"

View file

@ -0,0 +1,75 @@
; WetGit Forgejo configuration — managed by Ansible
; This file is merged with Forgejo's defaults on first boot.
; After first boot, Forgejo writes its own app.ini in /data/gitea/conf/.
; This template is used to seed initial configuration.
[DEFAULT]
APP_NAME = WetGit
[server]
DOMAIN = {{ forgejo_domain }}
SSH_DOMAIN = {{ forgejo_domain }}
ROOT_URL = https://{{ forgejo_domain }}/
HTTP_PORT = 3000
; HTTPS-only — no SSH, firewall blocks port 2222
DISABLE_SSH = true
LFS_START_SERVER = true
OFFLINE_MODE = false
[database]
DB_TYPE = sqlite3
PATH = /data/gitea/forgejo.db
[service]
DISABLE_REGISTRATION = true
REQUIRE_SIGNIN_VIEW = false
DEFAULT_KEEP_EMAIL_PRIVATE = true
[repository]
DEFAULT_BRANCH = main
PREFERRED_LICENSES = MIT License,CC0-1.0
MAX_CREATION_LIMIT = -1
ENABLE_PUSH_CREATE_USER = true
ENABLE_PUSH_CREATE_ORG = true
; 100 MB max file size for large law datasets
MAX_FILE_SIZE = 104857600
[git]
MAX_GIT_DIFF_LINES = 10000
MAX_GIT_DIFF_FILES = 1000
[git.timeout]
DEFAULT = 600
MIGRATE = 1200
MIRROR = 600
CLONE = 600
PULL = 600
GC = 120
[lfs]
PATH = /data/git/lfs
[ui]
DEFAULT_THEME = forgejo-auto
SHOW_USER_EMAIL = false
[actions]
ENABLED = true
[indexer]
REPO_INDEXER_ENABLED = true
REPO_INDEXER_PATH = /data/gitea/indexers/repos.bleve
REPO_INDEXER_EXCLUDE = node_modules/**
[markup.markdown]
ENABLED = true
FILE_EXTENSIONS = .md,.markdown
[mailer]
ENABLED = true
PROTOCOL = smtp+starttls
SMTP_ADDR = {{ forgejo_smtp_host | default('smtp.email.undefined') }}
SMTP_PORT = {{ forgejo_smtp_port | default(587) }}
FROM = Coornhert <coornhert@wetgit.nl>
USER = {{ forgejo_smtp_user | default('') }}
PASSWD = {{ forgejo_smtp_password | default('') }}

View file

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail

# WetGIT Forgejo backup — managed by Ansible
# Uses Forgejo's built-in dump command (no downtime).
#
# Produces, in BACKUP_DIR:
#   wetgit-forgejo-<timestamp>.tar.gz   Forgejo dump (script fails if this fails)
#   wetgit-redis-<timestamp>.rdb        Redis RDB snapshot (best-effort)
# Backups older than RETENTION_DAYS days are deleted afterwards.

BACKUP_DIR="{{ app_dir }}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=14
# Upper bound, in seconds, to wait for the Redis background save to finish.
REDIS_SAVE_TIMEOUT=30

# Print a message prefixed with the current time (evaluated per call, so
# long-running steps get accurate timestamps).
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

# Run redis-cli inside the Redis container, discarding stderr.
redis_cli() {
    docker exec wetgit-redis redis-cli "$@" 2>/dev/null
}

log "Starting WetGit backup..."

# Forgejo dump (runs inside container, no service stop needed).
# docker exec defaults to root, which Forgejo refuses to run as; use the
# image's unprivileged 'git' service account instead.
docker exec -u git wetgit-forgejo forgejo dump \
    --type tar.gz \
    --file "/data/backup-${TIMESTAMP}.tar.gz" \
    2>&1 || {
    log "ERROR: Forgejo dump failed"
    exit 1
}

# Move dump from container volume to backup dir
mv "{{ forgejo_data_dir }}/backup-${TIMESTAMP}.tar.gz" \
    "$BACKUP_DIR/wetgit-forgejo-${TIMESTAMP}.tar.gz"

# Also back up Redis (best-effort). BGSAVE writes an RDB snapshot (dump.rdb)
# asynchronously, so wait until LASTSAVE changes instead of assuming the save
# finishes within a fixed sleep.
last_save=$(redis_cli LASTSAVE || true)
if redis_cli BGSAVE >/dev/null; then
    waited=0
    while [ "$(redis_cli LASTSAVE || true)" = "$last_save" ] \
        && [ "$waited" -lt "$REDIS_SAVE_TIMEOUT" ]; do
        sleep 1
        waited=$((waited + 1))
    done
fi
if ! cp "{{ data_dir }}/redis/dump.rdb" \
    "$BACKUP_DIR/wetgit-redis-${TIMESTAMP}.rdb" 2>/dev/null; then
    log "WARNING: Redis snapshot could not be copied (non-fatal)"
fi

# Clean old backups
find "$BACKUP_DIR" -name "wetgit-forgejo-*.tar.gz" -mtime +${RETENTION_DAYS} -delete
find "$BACKUP_DIR" -name "wetgit-redis-*.rdb" -mtime +${RETENTION_DAYS} -delete

log "Backup complete: wetgit-forgejo-${TIMESTAMP}.tar.gz"

View file

@ -0,0 +1,45 @@
services:
forgejo:
image: codeberg.org/forgejo/forgejo:10
container_name: wetgit-forgejo
restart: unless-stopped
environment:
- USER_UID={{ wetgit_uid }}
- USER_GID={{ wetgit_gid }}
volumes:
- {{ forgejo_data_dir }}:/data
- /etc/timezone:/etc/timezone:ro
- /etc/localtime:/etc/localtime:ro
ports:
- "{{ backend_host }}:{{ forgejo_port }}:3000"
deploy:
resources:
limits:
memory: 1G
cpus: "2.0"
reservations:
memory: 256M
cpus: "0.5"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3000/api/v1/version"]
interval: 30s
timeout: 10s
retries: 3
networks:
- wetgit
redis:
image: redis:7-alpine
container_name: wetgit-redis
restart: unless-stopped
ports:
- "{{ backend_host }}:{{ redis_port }}:6379"
volumes:
- {{ data_dir }}/redis:/data
command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
networks:
- wetgit
networks:
wetgit:
name: wetgit-network

View file

@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail

# Mirror WetGit repos from self-hosted Forgejo to Codeberg
# Managed by Ansible — runs daily at 04:00
#
# For every entry in REPOS a bare clone is kept under MIRROR_DIR, refreshed
# from Forgejo and then pushed to Codeberg. A failed push is logged and
# skipped; a failed clone or fetch aborts the run (set -e).

CODEBERG_USER="coornhert"
CODEBERG_TOKEN_FILE="{{ app_dir }}/.codeberg-token"
MIRROR_DIR="{{ app_dir }}/mirrors"

REPOS=(
    "wetgit/meta"
    "wetgit/rijk"
    # Add more as they are created:
    # "wetgit/cvdr-noord-holland"
    # "wetgit/eu"
)

# Print a message prefixed with the current time (evaluated per call).
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

if [ ! -f "$CODEBERG_TOKEN_FILE" ]; then
    log "ERROR: Codeberg token not found at $CODEBERG_TOKEN_FILE"
    exit 1
fi
CODEBERG_TOKEN=$(cat "$CODEBERG_TOKEN_FILE")

mkdir -p "$MIRROR_DIR"

for REPO in "${REPOS[@]}"; do
    REPO_NAME=$(basename "$REPO")
    REPO_MIRROR_DIR="$MIRROR_DIR/$REPO_NAME.git"
    FORGEJO_URL="https://{{ forgejo_domain }}/${REPO}.git"
    CODEBERG_URL="https://${CODEBERG_USER}:${CODEBERG_TOKEN}@codeberg.org/${REPO}.git"

    log "Mirroring $REPO..."

    if [ ! -d "$REPO_MIRROR_DIR" ]; then
        log "Initial clone from Forgejo..."
        git clone --bare "$FORGEJO_URL" "$REPO_MIRROR_DIR"
    else
        log "Fetching from Forgejo..."
        git -C "$REPO_MIRROR_DIR" fetch origin --prune \
            '+refs/heads/*:refs/heads/*' '+refs/tags/*:refs/tags/*'
    fi

    # Ensure the 'codeberg' remote exists and carries the current token on
    # every run. Adding it only right after the initial clone would leave the
    # mirror without a remote if that first run was interrupted, and would
    # never pick up a rotated token.
    if git -C "$REPO_MIRROR_DIR" remote get-url codeberg >/dev/null 2>&1; then
        git -C "$REPO_MIRROR_DIR" remote set-url codeberg "$CODEBERG_URL"
    else
        git -C "$REPO_MIRROR_DIR" remote add codeberg "$CODEBERG_URL"
    fi

    log "Pushing to Codeberg..."
    git -C "$REPO_MIRROR_DIR" push codeberg --mirror --force 2>&1 || {
        log "WARNING: Push to Codeberg failed for $REPO (non-fatal)"
    }

    log "Done: $REPO"
done

log "Mirror complete."

View file

@ -0,0 +1,5 @@
---
- name: reload nginx
systemd:
name: nginx
state: reloaded

View file

@ -0,0 +1,118 @@
---
# Nginx vhosts for WetGIT
# IMPORTANT: Only adds vhost configs. Does NOT touch global nginx.conf
# (managed by dt-platform's nginx role).
#
# Strategy: Deploy HTTP-only first → get SSL certs → deploy full HTTPS config.
# --- Step 1: Check existing SSL certificates ---
- name: Check if API SSL certificate exists
stat:
path: "/etc/letsencrypt/live/{{ server_name }}/fullchain.pem"
register: ssl_cert_api
- name: Check if Forgejo SSL certificate exists
stat:
path: "/etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem"
register: ssl_cert_git
# --- Step 2: Deploy HTTP-only configs for domains without certs ---
- name: Deploy API HTTP-only vhost (pre-SSL)
copy:
content: |
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
server {
listen 80;
listen [::]:80;
server_name {{ server_name }};
location /.well-known/acme-challenge/ { root /var/www/certbot; }
location / { return 503; }
}
dest: /etc/nginx/sites-available/wetgit-api.conf
owner: root
group: root
mode: "0644"
when: not ssl_cert_api.stat.exists
notify: reload nginx
- name: Deploy Forgejo HTTP-only vhost (pre-SSL)
copy:
content: |
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
server {
listen 80;
listen [::]:80;
server_name {{ forgejo_domain }};
location /.well-known/acme-challenge/ { root /var/www/certbot; }
location / { return 503; }
}
dest: /etc/nginx/sites-available/wetgit-git.conf
owner: root
group: root
mode: "0644"
when: not ssl_cert_git.stat.exists
notify: reload nginx
# --- Step 3: Enable vhosts and reload nginx ---
- name: Enable API vhost
file:
src: /etc/nginx/sites-available/wetgit-api.conf
dest: /etc/nginx/sites-enabled/wetgit-api.conf
state: link
notify: reload nginx
- name: Enable Forgejo vhost
file:
src: /etc/nginx/sites-available/wetgit-git.conf
dest: /etc/nginx/sites-enabled/wetgit-git.conf
state: link
notify: reload nginx
# Force handler to run now so nginx has the HTTP configs before certbot
- name: Flush handlers (reload nginx for certbot)
meta: flush_handlers
# --- Step 4: Obtain SSL certificates via webroot ---
- name: Obtain SSL certificate for {{ server_name }}
command: >
certbot certonly --webroot
-w /var/www/certbot
-d {{ server_name }}
--non-interactive --agree-tos
--email coornhert@wetgit.nl
when: not ssl_cert_api.stat.exists
register: certbot_api
- name: Obtain SSL certificate for {{ forgejo_domain }}
command: >
certbot certonly --webroot
-w /var/www/certbot
-d {{ forgejo_domain }}
--non-interactive --agree-tos
--email coornhert@wetgit.nl
when: not ssl_cert_git.stat.exists
register: certbot_git
# --- Step 5: Deploy full HTTPS configs ---
- name: Deploy API nginx vhost (full HTTPS)
template:
src: wetgit-api.conf.j2
dest: /etc/nginx/sites-available/wetgit-api.conf
owner: root
group: root
mode: "0644"
notify: reload nginx
- name: Deploy Forgejo nginx vhost (full HTTPS)
template:
src: wetgit-git.conf.j2
dest: /etc/nginx/sites-available/wetgit-git.conf
owner: root
group: root
mode: "0644"
notify: reload nginx

View file

@ -0,0 +1,51 @@
# WetGIT API — managed by WetGIT Ansible (not dt-platform)
# Do NOT edit manually
server {
listen 80;
listen [::]:80;
server_name {{ server_name }};
# ACME challenge (reuse existing certbot webroot)
location /.well-known/acme-challenge/ {
root /var/www/certbot;
}
location / {
return 301 https://$host$request_uri;
}
}
server {
listen 443 ssl http2;
listen [::]:443 ssl http2;
server_name {{ server_name }};
ssl_certificate /etc/letsencrypt/live/{{ server_name }}/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/{{ server_name }}/privkey.pem;
# Security headers
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-Frame-Options "DENY" always;
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
# API proxy
location / {
proxy_pass http://{{ backend_host }}:{{ backend_port }};
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Timeouts for long-running legislation processing
proxy_read_timeout 120s;
proxy_connect_timeout 10s;
}
# Health check (proxied straight to the backend, excluded from the access log)
location = /health {
proxy_pass http://{{ backend_host }}:{{ backend_port }}/health;
access_log off;
}
}

View file

@ -0,0 +1,52 @@
# Forgejo (git.wetgit.nl) — managed by WetGIT Ansible (not dt-platform)
# Do NOT edit manually
server {
listen 80;
listen [::]:80;
server_name {{ forgejo_domain }};
location /.well-known/acme-challenge/ {
root /var/www/certbot;
}
location / {
return 301 https://$host$request_uri;
}
}
server {
listen 443 ssl http2;
listen [::]:443 ssl http2;
server_name {{ forgejo_domain }};
ssl_certificate /etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/{{ forgejo_domain }}/privkey.pem;
# Security headers
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
add_header X-Content-Type-Options "nosniff" always;
add_header X-Frame-Options "SAMEORIGIN" always;
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
# Large body size for git pushes (law datasets can be large)
client_max_body_size 512M;
# Timeouts for large git operations
proxy_connect_timeout 300;
proxy_send_timeout 300;
proxy_read_timeout 300;
location / {
proxy_pass http://{{ backend_host }}:{{ forgejo_port }};
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# WebSocket support (Forgejo live features)
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
}

27
ansible/site.yml Normal file
View file

@ -0,0 +1,27 @@
---
# WetGIT - Nederlandse wetgeving als code
#
# Usage:
# ansible-playbook ansible/site.yml
# ansible-playbook ansible/site.yml --tags forgejo
# ansible-playbook ansible/site.yml --tags app
# ansible-playbook ansible/site.yml --tags nginx
# ansible-playbook ansible/site.yml --check (dry-run)
#
# NOTE: This server is shared with dt-platform.
# This playbook only manages WetGIT resources.
# System-level config (users, packages, firewall) is managed by dt-platform.
- name: Deploy WetGIT
hosts: wetgit
become: yes
roles:
- role: wetgit-forgejo
tags: [forgejo, docker]
- role: wetgit-app
tags: [app]
- role: wetgit-nginx
tags: [nginx]

61
flake.lock generated Normal file
View file

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1774610258,
"narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

116
flake.nix Normal file
View file

@ -0,0 +1,116 @@
{
description = "WetGit - Nederlandse wetgeving als code";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
flake-utils.url = "github:numtide/flake-utils";
};
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = nixpkgs.legacyPackages.${system};
# Python 3.13 (zelfde versie als ansible gebruikt, voorkomt PATH-conflicten)
pythonEnv = pkgs.python313.withPackages (ps: with ps; [
# Conversie-pipeline (PRD: Technische Stack)
lxml # BWB XML-parsing met XPath/XSLT
pygit2 # Git-operaties via libgit2 (performanter dan GitPython)
pyyaml # YAML frontmatter generatie
python-frontmatter # Markdown + YAML frontmatter parsing
# API-laag (PRD: FastAPI)
fastapi
uvicorn # ASGI server
httpx # Async HTTP client (SRU-API, EUR-Lex)
pydantic # Data validatie
# Achtergrondtaken (PRD: Celery + Redis)
celery
redis # Python Redis client
# CLI-tool (PRD: wetgit CLI)
click
rich # Terminal formatting
# Testing
pytest
pytest-cov
pytest-asyncio
# Development tools
black
ruff
mypy
pip
setuptools
wheel
build
# Typing stubs
types-requests
types-pyyaml
]);
in {
devShells.default = pkgs.mkShell {
name = "wetgit";
buildInputs = with pkgs; [
# Python environment
pythonEnv
# Dependency management
uv
# Ansible (infrastructuur provisioning Hetzner)
ansible
ansible-lint
# Hetzner Cloud CLI
hcloud
# Redis server (lokale development)
redis
# Git & tools
git
jq
yq-go
curl
# Native dependencies voor pygit2
libgit2
];
shellHook = ''
echo "WetGit - Nederlandse wetgeving als code"
echo ""
echo "Python: $(python --version)"
echo "Ansible: $(ansible --version 2>/dev/null | head -1)"
echo "hcloud: $(hcloud version 2>/dev/null)"
echo ""
echo "Pipeline tools: lxml, pygit2, fastapi"
echo "Infra tools: ansible, hcloud"
echo ""
# Laad .env als die bestaat (API keys, Hetzner token)
if [ -f .env ]; then
set -a
source .env
set +a
echo "Loaded environment from .env"
echo ""
fi
# Venv voor PyPI-only packages (agentmail etc.)
if [ ! -d .venv ]; then
uv venv .venv --python python3.13 --seed
uv pip install --python .venv/bin/python agentmail
echo "Created .venv and installed PyPI dependencies"
fi
source .venv/bin/activate
'';
};
});
}

92
pyproject.toml Normal file
View file

@ -0,0 +1,92 @@
[project]
name = "wetgit"
version = "0.1.0"
description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit"
readme = "README.md"
license = "MIT"
requires-python = ">=3.12"
authors = [
{ name = "Coornhert", email = "coornhert@wetgit.nl" },
]
keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"]
classifiers = [
"Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Legal Industry",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.13",
"Topic :: Text Processing :: Markup",
]
dependencies = [
"lxml>=5.0",
"pygit2>=1.13",
"pyyaml>=6.0",
"python-frontmatter>=1.1",
"httpx>=0.27",
"click>=8.1",
"rich>=13.0",
"pydantic>=2.0",
]
[project.optional-dependencies]
api = [
"fastapi>=0.115",
"uvicorn>=0.30",
"celery>=5.4",
"redis>=5.0",
]
dev = [
"pytest>=8.0",
"pytest-cov>=5.0",
"pytest-asyncio>=0.24",
"black>=24.0",
"ruff>=0.6",
"mypy>=1.11",
"types-pyyaml",
"types-requests",
]
[project.scripts]
wetgit = "wetgit.cli.main:cli"
[project.urls]
Homepage = "https://wetgit.nl"
Repository = "https://git.wetgit.nl/wetgit/meta"
Issues = "https://git.wetgit.nl/wetgit/meta/issues"
[build-system]
requires = ["setuptools>=75.0"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
where = ["src"]
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
"unit: Unit tests (fast, no I/O)",
"integration: Integration tests (may need network/disk)",
"slow: Slow tests (large XML parsing, bulk operations)",
]
asyncio_mode = "auto"
[tool.black]
line-length = 99
target-version = ["py313"]
[tool.ruff]
line-length = 99
target-version = "py313"
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"]
ignore = ["E501"]
[tool.ruff.lint.isort]
known-first-party = ["wetgit"]
[tool.mypy]
python_version = "3.13"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true

3
src/wetgit/__init__.py Normal file
View file

@ -0,0 +1,3 @@
"""WetGit — Nederlandse wetgeving als code."""
__version__ = "0.1.0"

View file

@ -0,0 +1 @@
"""FastAPI REST API."""

View file

@ -0,0 +1 @@
"""WetGit CLI tool."""

17
src/wetgit/cli/main.py Normal file
View file

@ -0,0 +1,17 @@
"""WetGit CLI — command-line interface."""
import click
from wetgit import __version__
# Root command group of the `wetgit` executable; subcommands attach to it via
# `@cli.command()`. The docstring doubles as the --help text, so it is kept in
# the user-facing language.
@click.group()
@click.version_option(version=__version__, prog_name="wetgit")
def cli() -> None:
    """WetGit — Nederlandse wetgeving als code."""
# `wetgit version` subcommand: prints the package version on one line.
# The docstring is the subcommand's --help text and stays user-facing.
@cli.command()
def version() -> None:
    """Toon de WetGit versie."""
    banner = f"wetgit {__version__}"
    click.echo(banner)

55
src/wetgit/models.py Normal file
View file

@ -0,0 +1,55 @@
"""Domain models voor WetGit."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date
from enum import Enum
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification.

    Every member is also a ``str``, so it compares equal to its raw value
    and can be looked up from that value with ``RegelingType(value)``.
    """

    WET = "wet"
    AMVB = "amvb"
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"
class RegelingStatus(str, Enum):
    """Status of a regulation.

    Every member is also a ``str`` and compares equal to its raw value.
    """

    GELDEND = "geldend"
    VERVALLEN = "vervallen"
@dataclass(frozen=True)
class Regeling:
    """Metadata of a regulation.

    Instances are immutable and hashable. ``eu_implementatie`` is a list and
    therefore excluded from the generated hash (a list cannot be hashed); it
    still takes part in equality comparison.
    """

    # Required fields
    bwb_id: str
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date
    # Optional fields; None means "not set"
    datum_laatste_wijziging: date | None = None
    datum_verval: date | None = None
    citeertitel: str | None = None
    ministerie: str | None = None
    bron_url: str | None = None
    # hash=False: without it, the frozen dataclass's generated __hash__ raises
    # TypeError on the list value. A fresh list is created per instance.
    eu_implementatie: list[dict[str, str]] = field(default_factory=list, hash=False)
@dataclass(frozen=True)
class Artikel:
    """An article within a regulation.

    Instances are immutable and hashable. ``leden`` is a list and therefore
    excluded from the generated hash (a list cannot be hashed); it still
    takes part in equality comparison.
    """

    nummer: str
    titel: str | None
    inhoud: str
    # hash=False: without it, the frozen dataclass's generated __hash__ raises
    # TypeError on the list value. A fresh list is created per instance.
    leden: list[str] = field(default_factory=list, hash=False)

View file

@ -0,0 +1 @@
"""BWB/CVDR/EUR-Lex conversie-pipeline."""

View file

@ -0,0 +1,487 @@
"""BWB XML naar Markdown parser.
Parseert BWB toestand-XML (schema versie 2.0) naar Markdown + YAML frontmatter.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from datetime import date
from io import StringIO
import yaml
from lxml import etree
@dataclass
class ParsedRegeling:
    """Result of parsing a BWB toestand-XML document.

    Carries the extracted metadata alongside the rendered Markdown text and
    the frontmatter mapping.
    """

    # Extracted metadata
    bwb_id: str
    titel: str
    citeertitel: str | None
    soort: str
    datum_inwerkingtreding: str | None
    # Rendered output
    markdown: str
    frontmatter: dict[str, str | list[str] | None]
def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Convert a BWB toestand XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        A ParsedRegeling holding the extracted metadata and the Markdown text.

    Raises:
        ValueError: If the document contains no <wetgeving> element.
    """
    document = etree.parse(xml_path)
    wetgeving = document.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # Metadata: the title falls back from citation title to intitule to the id.
    bwb_id = _extract_bwb_id(document)
    soort = wetgeving.get("soort", "onbekend")
    datum = wetgeving.get("inwerkingtredingsdatum")
    intitule = wetgeving.findtext(".//intitule", default="").strip()
    citeertitel = wetgeving.findtext(".//citeertitel", default="").strip() or None
    titel = citeertitel or intitule or bwb_id

    # Candidate text containers, tried in order; the first match wins:
    #   formal acts            -> <wet-besluit><wettekst>
    #   ministerial regulations -> <regeling-tekst>
    #   circulars / ZBO        -> <circulaire-tekst>
    #   fallback               -> any <wettekst>
    container_paths = (
        ".//wet-besluit//wettekst",
        ".//regeling-tekst",
        ".//circulaire-tekst",
        ".//wettekst",
    )
    wettekst = None
    for path in container_paths:
        wettekst = wetgeving.find(path)
        if wettekst is not None:
            break

    blocks: list[str] = _parse_wettekst(wettekst) if wettekst is not None else []

    # Annexes are looked up on <wetgeving> as a whole, not inside the container.
    for bijlage in wetgeving.findall(".//bijlage"):
        rendered = _parse_bijlage(bijlage)
        if rendered:
            blocks.append(rendered)

    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
    }
    if citeertitel:
        frontmatter["citeertitel"] = citeertitel

    fm_yaml = yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False)
    body = "\n\n".join(blocks)
    markdown = "".join(
        [
            f"---\n{fm_yaml.strip()}\n---\n\n",
            f"# {titel}\n\n",
            body,
            "\n",
        ]
    )

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
def _extract_bwb_id(tree: etree._ElementTree) -> str:
    """Return the BWB identifier stored in the document.

    The ``bwb-id`` attribute is looked up on <toestand> elements first, then
    on <bwb-inputbestand> and its descendants.

    Args:
        tree: The parsed XML document.

    Returns:
        The first non-empty ``bwb-id`` found, or "ONBEKEND" when there is none.
    """
    # Primary location: the bwb-id attribute of a <toestand> element.
    for toestand in tree.iter("toestand"):
        bwb_id = toestand.get("bwb-id")
        if bwb_id:
            return bwb_id

    # Fallback: <bwb-inputbestand> itself or anything below it.
    inp = tree.find(".//bwb-inputbestand")
    if inp is not None:
        for node in inp.iter():
            bwb_id = node.get("bwb-id")
            if bwb_id:
                return bwb_id

    return "ONBEKEND"
def _parse_wettekst(wettekst: etree._Element) -> list[str]:
    """Render the direct children of a text container as Markdown blocks.

    Children with an unrecognised tag are ignored, and blocks that are empty
    or whitespace-only are dropped from the result.
    """
    # Structural containers: tag -> (heading level, label before the number).
    structure = {
        "deel": (2, "Deel"),
        "boek": (2, "Boek"),
        "hoofdstuk": (2, "Hoofdstuk"),
        "titeldeel": (2, "Titel"),
        "afdeling": (3, "Afdeling"),
        "paragraaf": (3, "Paragraaf"),
        "circulaire.divisie": (2, ""),
        "sub-paragraaf": (3, "Paragraaf"),
        "divisie": (2, ""),
    }

    blocks: list[str] = []
    for node in wettekst:
        name = node.tag
        if name in structure:
            level, label = structure[name]
            rendered = _parse_structuur(node, level=level, label=label)
        elif name in ("artikel", "enig-artikel"):
            rendered = _parse_artikel(node)
        elif name == "tekst":
            rendered = _parse_tekst_content(node)
        elif name == "al":
            rendered = _get_text(node)
        elif name == "table":
            rendered = _parse_cals_table(node)
        elif name == "tabel":
            rendered = "*[tabel]*"
        elif name in ("plaatje", "illustratie"):
            rendered = "*[afbeelding]*"
        else:
            continue
        if rendered.strip():
            blocks.append(rendered)
    return blocks
def _parse_structuur(elem: etree._Element, level: int, label: str) -> str:
    """Render a structural element (chapter, section, paragraph, book, ...).

    Args:
        elem: The structural element to render.
        level: Markdown heading level for this element. Nested structures are
            rendered one level deeper; the number of ``#`` is capped at 6.
        label: Word placed before the number (e.g. "Hoofdstuk"); may be "".

    Returns:
        The heading followed by the rendered children, separated by blank
        lines. The heading is omitted when label, number and title are all
        empty, and children that render to an empty string are skipped.
    """
    nr = ""
    titel = ""

    # A <kop> child carries nr and titel for books and some other structures.
    kop = elem.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # Fallback: nr / title elements that are direct children.
    if not nr:
        nr = elem.findtext("./nr", default="").strip()
    if not titel:
        title_tags = (
            "hoofdstuktitel",
            "titeldeel-titel",
            "afdelingtitel",
            "paragraaftitel",
            "boektitel",
            "titel",
        )
        candidates = (elem.findtext(f"./{name}", default="") for name in title_tags)
        titel = next((text for text in candidates if text), "").strip()

    # Build the heading from the non-empty pieces only, so an empty label or
    # number no longer yields "##  1", "##  . Titel" or a trailing space.
    prefix = " ".join(piece for piece in (label, nr) if piece)
    if prefix and titel:
        caption = f"{prefix}. {titel}"
    else:
        caption = prefix or titel

    parts: list[str] = []
    if caption:
        hashes = "#" * min(level, 6)  # Markdown has no heading deeper than 6
        parts.append(f"{hashes} {caption}")

    skip_tags = {
        "nr", "kop", "titel",
        "hoofdstuktitel", "titeldeel-titel", "afdelingtitel",
        "paragraaftitel", "boektitel",
    }
    # Nested structural containers: tag -> label before the number.
    nested_labels = {
        "deel": "Deel",
        "boek": "Boek",
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
        "circulaire.divisie": "",
        "sub-paragraaf": "Paragraaf",
        "divisie": "",
    }

    for child in elem:
        tag = child.tag
        if tag in skip_tags:
            continue
        block = ""
        if tag in nested_labels:
            block = _parse_structuur(child, level=level + 1, label=nested_labels[tag])
        elif tag in ("artikel", "enig-artikel"):
            block = _parse_artikel(child)
        elif tag == "al":
            block = _get_text(child)
        elif tag == "tekst":
            block = _parse_tekst_content(child)
        elif tag == "tussenkop":
            text = _get_text(child).strip()
            if text:
                sub_heading = "#" * min(level + 1, 6)
                block = f"{sub_heading} {text}"
        elif tag == "redactie":
            text = _get_text(child).strip()
            if text:
                block = f"*[{text}]*"
        elif tag == "table":
            block = _parse_cals_table(child)
        elif tag == "tabel":
            block = "*[tabel]*"
        elif tag in ("plaatje", "illustratie"):
            block = "*[afbeelding]*"
        if block:
            parts.append(block)

    return "\n\n".join(parts)
def _parse_artikel(artikel: etree._Element) -> str:
    """Render an <artikel> (or <enig-artikel>) element as Markdown.

    The heading is always a level-3 heading "### Artikel <nr>", where the
    number is the first <nr> found anywhere below the element.

    Returns:
        The heading followed by the rendered direct children, separated by
        blank lines. Children with an unrecognised tag are ignored.
    """
    nr = artikel.findtext(".//nr", default="").strip()
    parts = [f"### Artikel {nr}" if nr else "### Artikel"]

    for child in artikel:
        tag = child.tag
        if tag in ("nr", "lidnr"):
            continue
        if tag == "titel":
            titel_text = _get_text(child).strip()
            if titel_text:
                parts.append(f"*{titel_text}*")
        elif tag == "lid":
            parts.append(_parse_lid(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "lijst":
            parts.append(_parse_lijst(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            # CALS tables directly under an article were silently dropped;
            # render them the same way the other parsers in this module do.
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag == "gereserveerd":
            parts.append("*[Dit artikel is gereserveerd.]*")
        elif tag == "vervallen":
            parts.append("*[Dit artikel is vervallen.]*")
        elif tag == "lid-vervallen":
            lidnr = child.findtext(".//lidnr", default="").strip()
            parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*")

    return "\n\n".join(parts)
def _parse_lid(lid: etree._Element) -> str:
    """Render a <lid> (numbered article member) as Markdown.

    A numbered member with at most one content block is rendered on a single
    line ("**1.** text"); otherwise all blocks are separated by blank lines.
    """
    lidnr = lid.findtext(".//lidnr", default="").strip()
    blocks: list[str] = [f"**{lidnr}.**"] if lidnr else []

    for node in lid:
        name = node.tag
        if name == "lidnr":
            continue
        if name == "al":
            blocks.append(_get_text(node))
        elif name == "lijst":
            blocks.append(_parse_lijst(node))
        elif name == "tekst":
            blocks.append(_parse_tekst_content(node))
        elif name == "table":
            blocks.append(_parse_cals_table(node))
        elif name == "tabel":
            blocks.append("*[tabel]*")
        elif name == "formule":
            blocks.append(f"*[formule: {_get_text(node)}]*")
        elif name == "redactie":
            note = _get_text(node).strip()
            if note:
                blocks.append(f"*[{note}]*")

    if lidnr and len(blocks) <= 2:
        return " ".join(blocks)
    return "\n\n".join(blocks)
def _parse_lijst(lijst: etree._Element) -> str:
    """Render a <lijst> element as a Markdown list, one line per item.

    Only the direct <li> children are treated as items of this list. Nested
    lists are rendered as part of the item that contains them, so their items
    are no longer emitted a second time at the outer level.

    Each item is prefixed with its own <li.nr> text, or "- " when it has none.
    """
    items: list[str] = []
    for li in lijst.findall("./li"):
        # Direct children only: a descendant search could pick up the number
        # or body of a nested list item instead of this item's own.
        nr = li.findtext("./li.nr", default="").strip()
        body = li.find("./li.body")
        # Without <li.body> the content sits directly under <li>. Parsing its
        # block children keeps <li.nr> out of the item text, so the number is
        # not repeated after the prefix.
        # NOTE(review): an <li> that holds only bare text next to <li.nr>
        # still falls back to the flattened text, which includes the number.
        content = li if body is None else body
        text = _parse_tekst_content(content)
        prefix = f"{nr} " if nr else "- "
        items.append(f"{prefix}{text}")
    return "\n".join(items)
def _parse_tekst_content(elem: etree._Element) -> str:
    """Render mixed block content (paragraphs, lists, tables, ...) of an element.

    Recognised direct children are rendered as separate blocks joined by blank
    lines. When no child produced a block, the element's flattened text is
    used instead.
    """
    blocks: list[str] = []
    for node in elem:
        name = node.tag
        if name == "al":
            blocks.append(_get_text(node))
        elif name == "lijst":
            blocks.append(_parse_lijst(node))
        elif name == "table":
            blocks.append(_parse_cals_table(node))
        elif name == "tabel":
            blocks.append("*[tabel]*")
        elif name in ("plaatje", "illustratie"):
            blocks.append("*[afbeelding]*")
        elif name == "redactie":
            note = _get_text(node).strip()
            if note:
                blocks.append(f"*[{note}]*")

    if not blocks:
        # No structured children: fall back to the plain text of the element.
        plain = _get_text(elem)
        if plain:
            blocks.append(plain)

    return "\n\n".join(blocks)
def _parse_cals_table(table: etree._Element) -> str:
    """Render a CALS <table> element as a Markdown table.

    Rows of the first <thead> come first, followed by every other <row> in
    document order. The first emitted row becomes the Markdown header row,
    also when the table has no <thead>. Short rows are padded with empty
    cells; "|" is escaped and newlines inside cells become spaces.

    Returns:
        The Markdown table, or the placeholder "*[tabel]*" when the table
        contains no rows.
    """
    thead = table.find(".//thead")
    header_rows = thead.findall(".//row") if thead is not None else []

    # Collect all remaining rows in one pass. Excluding the header rows by
    # identity prevents them from being emitted twice when rows sit directly
    # under <tgroup>, and visiting every row keeps the rows of additional
    # <tbody>/<tgroup> sections that were previously dropped.
    # header_rows keeps the header elements alive, so their id() stays valid.
    header_ids = {id(row) for row in header_rows}
    body_rows = [row for row in table.iter("row") if id(row) not in header_ids]

    rows: list[list[str]] = [
        [_get_text(entry) for entry in row.findall(".//entry")]
        for row in header_rows + body_rows
    ]
    if not rows:
        return "*[tabel]*"

    # Pad every row to the widest row so the Markdown table stays rectangular.
    max_cols = max(len(cells) for cells in rows)
    for cells in rows:
        cells.extend([""] * (max_cols - len(cells)))

    lines: list[str] = []
    for index, cells in enumerate(rows):
        escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in cells]
        lines.append("| " + " | ".join(escaped) + " |")
        if index == 0:
            # Separator line that marks the first row as the header.
            lines.append("| " + " | ".join("---" for _ in escaped) + " |")
    return "\n".join(lines)
def _parse_bijlage(bijlage: etree._Element) -> str:
    """Render a <bijlage> (annex) element as Markdown.

    The heading is "## Bijlage <nr>. <titel>", with number and title taken
    from the <kop> child; missing pieces are left out without leaving stray
    spaces or punctuation.

    Returns:
        The heading followed by the rendered direct children, separated by
        blank lines. Children with an unrecognised tag are ignored.
    """
    kop = bijlage.find("./kop")
    nr = ""
    titel = ""
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # rstrip avoids "## Bijlage " and "## Bijlage . Titel" when nr is empty.
    header = f"## Bijlage {nr}".rstrip()
    if titel:
        header += f". {titel}"

    # Same labels as the other structure parsers ("Titel" for <titeldeel>).
    structure_labels = {
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
    }

    parts = [header]
    for child in bijlage:
        tag = child.tag
        if tag == "kop":
            continue
        if tag == "artikel":
            parts.append(_parse_artikel(child))
        elif tag in ("tekst", "bijlage-tekst"):
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            # CALS tables were previously dropped from annexes.
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag in structure_labels:
            parts.append(_parse_structuur(child, level=3, label=structure_labels[tag]))
        elif tag == "al":
            parts.append(_get_text(child))

    return "\n\n".join(parts)
def _get_text(elem: etree._Element) -> str:
    """Return the full text of an element with inline formatting applied.

    Inline children such as <nadruk>, <sup>, <sub>/<inf>, <extref> and
    <intref> are handled by ``_collect_text``; surrounding whitespace is
    stripped from the result.
    """
    fragments: list[str] = []
    _collect_text(elem, fragments)
    joined = "".join(fragments)
    return joined.strip()
def _collect_text(elem: etree._Element, parts: list[str]) -> None:
    """Recursively collect the text of an element with inline Markdown formatting.

    Args:
        elem: Element whose text and inline children are collected.
        parts: Output list; text fragments are appended in document order.

    Inline mapping:
        <nadruk type="vet">  -> **text**
        <nadruk type="cur">  -> *text*
        <sup>                -> ^text
        <sub> / <inf>        -> _text
        any other element    -> its text, processed recursively

    XML comments and processing instructions contribute no text of their own;
    only the text that follows them (their tail) is kept.
    """
    if elem.text:
        parts.append(elem.text)

    for child in elem:
        tag = child.tag
        if not isinstance(tag, str):
            # Comments and processing instructions show up as children whose
            # tag is not a string. Their content is not document text, so it
            # is skipped instead of falling into the catch-all branch.
            pass
        elif tag == "nadruk":
            inner = "".join(child.itertext())
            nadruk_type = child.get("type", "")
            if nadruk_type == "vet":
                parts.append(f"**{inner}**")
            elif nadruk_type == "cur":
                parts.append(f"*{inner}*")
            else:
                parts.append(inner)
        elif tag == "sup":
            parts.append("^" + "".join(child.itertext()))
        elif tag in ("sub", "inf"):
            parts.append("_" + "".join(child.itertext()))
        elif tag == "nootref":
            # Footnote reference: keep its text as-is.
            parts.append("".join(child.itertext()))
        else:
            # <extref>/<intref> (link target is dropped, text is kept) and any
            # unknown inline element: recurse so nested formatting survives.
            _collect_text(child, parts)

        if child.tail:
            parts.append(child.tail)

0
tests/__init__.py Normal file
View file

View file

View file

@ -0,0 +1,59 @@
"""Tests voor de BWB XML parser."""
import pytest
from wetgit.pipeline.bwb_parser import parse_bwb_xml
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory):
    """Download the Grondwet XML once per test module and return its path.

    The fixture is module-scoped so the tests share a single download instead
    of hitting the remote repository once per test; the tests only read the
    file. ``tmp_path_factory`` is used because ``tmp_path`` is function-scoped.
    """
    # Imported lazily so collecting the test module does not require httpx.
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that parse the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        """Identifier, title and kind are extracted from the document."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.bwb_id == "BWBR0001840"
        assert result.titel == "Grondwet"
        assert result.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        """The frontmatter mapping carries the expected metadata."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.frontmatter["bwb_id"] == "BWBR0001840"
        assert result.frontmatter["type"] == "wet"
        assert result.frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(result.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        """Article 1 is rendered with its heading and text."""
        result = parse_bwb_xml(grondwet_xml)
        assert "### Artikel 1" in result.markdown
        assert "gelijke gevallen gelijk behandeld" in result.markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        """Chapters are rendered as level-2 headings."""
        result = parse_bwb_xml(grondwet_xml)
        assert "## Hoofdstuk 1" in result.markdown
        assert "## Hoofdstuk 2" in result.markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        """The Markdown starts with a delimited YAML frontmatter block."""
        result = parse_bwb_xml(grondwet_xml)
        assert result.markdown.startswith("---\n")
        assert "\n---\n" in result.markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        """The document opens with frontmatter and has exactly one H1."""
        result = parse_bwb_xml(grondwet_xml)
        lines = result.markdown.split("\n")
        assert lines[0] == "---"
        # Exactly one level-1 heading, carrying the title.
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"