Compare commits
2 commits
bed91e891e
...
c481ebf9e7
| Author | SHA1 | Date | |
|---|---|---|---|
| c481ebf9e7 | |||
| 1dc93b0f89 |
35 changed files with 1769 additions and 0 deletions
10
.env.example
Normal file
10
.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# WetGIT Environment Variables
|
||||
# Copy to .env and fill in real values:
|
||||
# cp .env.example .env
|
||||
|
||||
# AgentMail API (coornhert@wetgit.nl)
|
||||
# Get your key from https://console.agentmail.to
|
||||
AGENTMAIL_API_KEY=
|
||||
|
||||
# Hetzner Cloud
|
||||
HCLOUD_TOKEN=
|
||||
33
.gitignore
vendored
Normal file
33
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# Secrets
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
ansible/.vault_pass
|
||||
|
||||
# Nix / direnv
|
||||
.direnv/
|
||||
result
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.venv/
|
||||
*.egg
|
||||
|
||||
# Testing
|
||||
.coverage
|
||||
htmlcov/
|
||||
.pytest_cache/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
10
ansible/ansible.cfg
Normal file
10
ansible/ansible.cfg
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
[defaults]
|
||||
inventory = inventory/hosts
|
||||
remote_tmp = /tmp/.ansible/tmp
|
||||
host_key_checking = True
|
||||
retry_files_enabled = False
|
||||
roles_path = roles
|
||||
vault_password_file = .vault_pass
|
||||
|
||||
[ssh_connection]
|
||||
pipelining = True
|
||||
45
ansible/group_vars/wetgit/main.yml
Normal file
45
ansible/group_vars/wetgit/main.yml
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
# WetGIT - Nederlandse wetgeving als code
|
||||
# Deployment variables for dt-prod-01
|
||||
#
|
||||
# IMPORTANT: This server is shared with dt-platform.
|
||||
# Do NOT use ports 8001 (dt-chatbot), 8200 (grimoire).
|
||||
# Do NOT modify /opt/dt-chatbot, /opt/dt-skills-portal, /opt/grimoire.
|
||||
# Do NOT modify the global nginx.conf — only add vhost configs.
|
||||
|
||||
# --- Application ---
|
||||
app_name: wetgit
|
||||
app_dir: /opt/wetgit
|
||||
data_dir: /data/wetgit
|
||||
|
||||
# FastAPI backend
|
||||
backend_port: 8002
|
||||
backend_workers: 1
|
||||
backend_host: "127.0.0.1"
|
||||
|
||||
# --- Domains ---
|
||||
server_name: "api.wetgit.nl"
|
||||
forgejo_domain: "git.wetgit.nl"
|
||||
|
||||
# --- Forgejo ---
|
||||
forgejo_port: 3000
|
||||
forgejo_data_dir: /opt/wetgit/data
|
||||
forgejo_admin_user: coornhert
|
||||
forgejo_admin_email: coornhert@wetgit.nl
|
||||
|
||||
# --- Redis (Docker, shared network with Forgejo) ---
|
||||
redis_port: 6379
|
||||
redis_host: "127.0.0.1"
|
||||
|
||||
# --- Celery ---
|
||||
celery_concurrency: 2
|
||||
|
||||
# --- Codeberg mirror ---
|
||||
codeberg_api_token: "{{ vault_codeberg_api_token | default('') }}"
|
||||
|
||||
# --- AgentMail ---
|
||||
agentmail_api_key: "{{ vault_agentmail_api_key }}"
|
||||
|
||||
# --- Secrets (from vault.yml) ---
|
||||
# vault_agentmail_api_key
|
||||
# vault_codeberg_api_token (add when Codeberg account is ready)
|
||||
# vault_forgejo_admin_password (initial admin password)
|
||||
14
ansible/group_vars/wetgit/vault.yml
Normal file
14
ansible/group_vars/wetgit/vault.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
$ANSIBLE_VAULT;1.1;AES256
|
||||
35323237613730303463313335643433616238663932643630636530356461323433666435653436
|
||||
3433343462343538333335343165353538613435613962650a656166366364393564353733343561
|
||||
66643462313261643538653839393365643634376432373665653133383464313636633762366163
|
||||
6562336332396535390a333062323534373963356439353336633964383832313431623934653739
|
||||
37646339376338623536323336353931343039323263666265363763373266343533333236346635
|
||||
37656436623764393037393138343536313666613439666535656631313031343061346130376136
|
||||
64383164643466643162393537343265313632343432336238393030306164636434356463396434
|
||||
34656334383731326131393061333138643435366534333965376666393535316334396662633561
|
||||
61386636336438383563326565336635643663313934326333323939663637653531363261613733
|
||||
38646631333739303737616630663337663265616462346637326539306338613866313762306662
|
||||
38633066323936623233336631653836656531633839643739313966623065313931356630613134
|
||||
39636539643065663963626437383637643932633164306337626330623466313737623532366631
|
||||
6435
|
||||
2
ansible/inventory/hosts
Normal file
2
ansible/inventory/hosts
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
[wetgit]
|
||||
dt-prod-01 ansible_host=100.98.29.89 ansible_user=deploy ansible_become=yes
|
||||
12
ansible/roles/wetgit-app/handlers/main.yml
Normal file
12
ansible/roles/wetgit-app/handlers/main.yml
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
---
# Handlers for the wetgit-app role.
#
# Both handlers are skipped while no application code is deployed.
# `app_code` is registered by the role's "Check if application code exists"
# task. The env/unit template tasks notify these handlers even on a first
# deploy without code; restarting then would launch units whose venv binary
# does not exist yet, and Restart=always in the unit files would keep
# respawning them.

- name: restart wetgit
  systemd:
    name: wetgit
    state: restarted
    daemon_reload: yes
  when:
    - app_code is defined
    - app_code.stat.exists

- name: restart wetgit-celery
  systemd:
    name: wetgit-celery
    state: restarted
    daemon_reload: yes
  when:
    - app_code is defined
    - app_code.stat.exists
|
||||
79
ansible/roles/wetgit-app/tasks/main.yml
Normal file
79
ansible/roles/wetgit-app/tasks/main.yml
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
---
# WetGIT FastAPI application + Celery worker
# Deploys to /opt/wetgit/backend with own venv and systemd services
#
# Directories are created by wetgit-forgejo role (runs first).
# This role only manages the FastAPI app and Celery worker.
#
# NOTE: The venv is only built, and the services are only enabled/started,
# when application code exists (backend/requirements.txt is present).
# The env file and unit files are always templated so they are already in
# place when the code arrives.

- name: Check if application code exists
  stat:
    path: "{{ app_dir }}/backend/requirements.txt"
  register: app_code

- name: Create Python venv
  command: python3 -m venv {{ app_dir }}/backend/venv
  args:
    creates: "{{ app_dir }}/backend/venv/bin/python"
  when: app_code.stat.exists

# The Celery worker runs from the same venv as the API, so a dependency
# change has to restart both services.
- name: Install Python dependencies
  pip:
    requirements: "{{ app_dir }}/backend/requirements.txt"
    virtualenv: "{{ app_dir }}/backend/venv"
  when: app_code.stat.exists
  notify:
    - restart wetgit
    - restart wetgit-celery

# Runs after the pip install so that packages installed by this (privileged)
# play are handed over to www-data together with the rest of the venv.
- name: Set venv ownership
  file:
    path: "{{ app_dir }}/backend/venv"
    owner: www-data
    group: www-data
    recurse: yes
  when: app_code.stat.exists

# Both unit files load this file via EnvironmentFile, so both services must
# be restarted when it changes.
- name: Deploy environment file
  template:
    src: wetgit.env.j2
    dest: "{{ app_dir }}/backend/.env"
    owner: www-data
    group: www-data
    mode: "0600"
  notify:
    - restart wetgit
    - restart wetgit-celery

- name: Deploy WetGIT systemd service
  template:
    src: wetgit.service.j2
    dest: /etc/systemd/system/wetgit.service
    owner: root
    group: root
    mode: "0644"
  notify: restart wetgit

- name: Deploy Celery worker systemd service
  template:
    src: wetgit-celery.service.j2
    dest: /etc/systemd/system/wetgit-celery.service
    owner: root
    group: root
    mode: "0644"
  notify: restart wetgit-celery

# Only start services when app code is deployed
- name: Enable and start WetGIT service
  systemd:
    name: wetgit
    enabled: yes
    state: started
    daemon_reload: yes
  when: app_code.stat.exists

- name: Enable and start Celery worker
  systemd:
    name: wetgit-celery
    enabled: yes
    state: started
    daemon_reload: yes
  when: app_code.stat.exists
|
||||
17
ansible/roles/wetgit-app/templates/wetgit-celery.service.j2
Normal file
17
ansible/roles/wetgit-app/templates/wetgit-celery.service.j2
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
[Unit]
|
||||
Description=WetGIT Celery Worker
|
||||
After=network.target docker.service
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=www-data
|
||||
Group=www-data
|
||||
WorkingDirectory={{ app_dir }}/backend
|
||||
EnvironmentFile={{ app_dir }}/backend/.env
|
||||
ExecStart={{ app_dir }}/backend/venv/bin/celery -A tasks worker --loglevel=info --concurrency={{ celery_concurrency }}
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
19
ansible/roles/wetgit-app/templates/wetgit.env.j2
Normal file
19
ansible/roles/wetgit-app/templates/wetgit.env.j2
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# WetGIT environment — managed by Ansible
|
||||
# Do NOT edit manually on the server
|
||||
|
||||
# FastAPI
|
||||
WETGIT_HOST={{ backend_host }}
|
||||
WETGIT_PORT={{ backend_port }}
|
||||
WETGIT_WORKERS={{ backend_workers }}
|
||||
|
||||
# Redis / Celery
|
||||
REDIS_URL=redis://{{ redis_host }}:{{ redis_port }}/0
|
||||
CELERY_BROKER_URL=redis://{{ redis_host }}:{{ redis_port }}/0
|
||||
CELERY_RESULT_BACKEND=redis://{{ redis_host }}:{{ redis_port }}/1
|
||||
|
||||
# AgentMail
|
||||
AGENTMAIL_API_KEY={{ agentmail_api_key }}
|
||||
|
||||
# Data
|
||||
WETGIT_DATA_DIR={{ data_dir }}
|
||||
WETGIT_GIT_REPOS_DIR={{ data_dir }}/git-repos
|
||||
17
ansible/roles/wetgit-app/templates/wetgit.service.j2
Normal file
17
ansible/roles/wetgit-app/templates/wetgit.service.j2
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
[Unit]
|
||||
Description=WetGIT API - Nederlandse wetgeving als code
|
||||
After=network.target docker.service
|
||||
Wants=wetgit-celery.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=www-data
|
||||
Group=www-data
|
||||
WorkingDirectory={{ app_dir }}/backend
|
||||
EnvironmentFile={{ app_dir }}/backend/.env
|
||||
ExecStart={{ app_dir }}/backend/venv/bin/uvicorn main:app --host {{ backend_host }} --port {{ backend_port }} --workers {{ backend_workers }}
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
5
ansible/roles/wetgit-forgejo/handlers/main.yml
Normal file
5
ansible/roles/wetgit-forgejo/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
- name: restart forgejo
|
||||
community.docker.docker_compose_v2:
|
||||
project_src: "{{ app_dir }}/docker"
|
||||
state: restarted
|
||||
148
ansible/roles/wetgit-forgejo/tasks/main.yml
Normal file
148
ansible/roles/wetgit-forgejo/tasks/main.yml
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
---
|
||||
# WetGIT Forgejo (self-hosted Git) + Redis
|
||||
#
|
||||
# Deploys Forgejo and Redis as Docker containers.
|
||||
# Forgejo serves git.wetgit.nl (HTTPS-only, no SSH — blocked by firewall).
|
||||
# Redis provides Celery broker for the WetGIT pipeline.
|
||||
#
|
||||
# IMPORTANT: Does NOT touch dt-platform's Docker services (grimoire).
|
||||
# All containers use the 'wetgit-network' Docker network.
|
||||
|
||||
# --- System user ---
|
||||
|
||||
- name: Create wetgit system user
|
||||
user:
|
||||
name: wetgit
|
||||
system: yes
|
||||
home: /opt/wetgit
|
||||
shell: /bin/bash
|
||||
create_home: no
|
||||
|
||||
- name: Get wetgit user UID
|
||||
command: id -u wetgit
|
||||
register: wetgit_uid_result
|
||||
changed_when: false
|
||||
check_mode: false
|
||||
|
||||
- name: Get wetgit user GID
|
||||
command: id -g wetgit
|
||||
register: wetgit_gid_result
|
||||
changed_when: false
|
||||
check_mode: false
|
||||
|
||||
- name: Store wetgit UID/GID as facts
|
||||
set_fact:
|
||||
wetgit_uid: "{{ wetgit_uid_result.stdout }}"
|
||||
wetgit_gid: "{{ wetgit_gid_result.stdout }}"
|
||||
|
||||
# --- Directories ---
|
||||
|
||||
- name: Create WetGIT directories
|
||||
file:
|
||||
path: "{{ item.path }}"
|
||||
state: directory
|
||||
owner: "{{ item.owner }}"
|
||||
group: "{{ item.group }}"
|
||||
mode: "0755"
|
||||
loop:
|
||||
# Forgejo directories (owned by wetgit user)
|
||||
- { path: "{{ app_dir }}/docker", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ forgejo_data_dir }}", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ forgejo_data_dir }}/gitea/conf", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ data_dir }}/redis", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ app_dir }}/scripts", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ app_dir }}/backups", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ app_dir }}/logs", owner: wetgit, group: wetgit }
|
||||
- { path: "{{ app_dir }}/mirrors", owner: wetgit, group: wetgit }
|
||||
# Application directories (parent dirs owned by root; backend/ and git-repos/ owned by www-data for FastAPI/Celery)
|
||||
- { path: "{{ app_dir }}", owner: root, group: root }
|
||||
- { path: "{{ app_dir }}/backend", owner: www-data, group: www-data }
|
||||
- { path: "{{ data_dir }}", owner: root, group: root }
|
||||
- { path: "{{ data_dir }}/git-repos", owner: www-data, group: www-data }
|
||||
|
||||
# --- Forgejo config ---
|
||||
|
||||
- name: Deploy Forgejo app.ini (initial seed)
|
||||
template:
|
||||
src: app.ini.j2
|
||||
dest: "{{ forgejo_data_dir }}/gitea/conf/app.ini"
|
||||
owner: wetgit
|
||||
group: wetgit
|
||||
mode: "0644"
|
||||
# Don't overwrite if Forgejo has already modified it
|
||||
force: no
|
||||
notify: restart forgejo
|
||||
|
||||
# --- Docker Compose ---
|
||||
|
||||
- name: Deploy Docker Compose stack
|
||||
template:
|
||||
src: docker-compose.yml.j2
|
||||
dest: "{{ app_dir }}/docker/docker-compose.yml"
|
||||
owner: wetgit
|
||||
group: wetgit
|
||||
mode: "0644"
|
||||
notify: restart forgejo
|
||||
|
||||
- name: Start WetGIT Docker stack
|
||||
community.docker.docker_compose_v2:
|
||||
project_src: "{{ app_dir }}/docker"
|
||||
state: present
|
||||
|
||||
# --- Backup script ---
|
||||
|
||||
- name: Deploy backup script
|
||||
template:
|
||||
src: backup.sh.j2
|
||||
dest: "{{ app_dir }}/scripts/backup.sh"
|
||||
owner: wetgit
|
||||
group: wetgit
|
||||
mode: "0755"
|
||||
|
||||
# --- Mirror script ---
|
||||
|
||||
- name: Deploy Codeberg mirror script
|
||||
template:
|
||||
src: mirror-to-codeberg.sh.j2
|
||||
dest: "{{ app_dir }}/scripts/mirror-to-codeberg.sh"
|
||||
owner: wetgit
|
||||
group: wetgit
|
||||
mode: "0755"
|
||||
|
||||
- name: Deploy Codeberg token
|
||||
copy:
|
||||
content: "{{ codeberg_api_token }}"
|
||||
dest: "{{ app_dir }}/.codeberg-token"
|
||||
owner: wetgit
|
||||
group: wetgit
|
||||
mode: "0600"
|
||||
when: codeberg_api_token is defined and codeberg_api_token | length > 0
|
||||
|
||||
# --- Cron jobs ---
|
||||
|
||||
- name: Configure backup cron (weekly Sunday 02:00)
|
||||
cron:
|
||||
name: "wetgit-backup"
|
||||
user: root
|
||||
weekday: "0"
|
||||
hour: "2"
|
||||
minute: "0"
|
||||
job: "{{ app_dir }}/scripts/backup.sh >> {{ app_dir }}/logs/backup.log 2>&1"
|
||||
|
||||
- name: Configure Codeberg mirror cron (daily 04:00)
|
||||
cron:
|
||||
name: "wetgit-codeberg-mirror"
|
||||
user: wetgit
|
||||
hour: "4"
|
||||
minute: "0"
|
||||
job: "{{ app_dir }}/scripts/mirror-to-codeberg.sh >> {{ app_dir }}/logs/mirror.log 2>&1"
|
||||
when: codeberg_api_token is defined and codeberg_api_token | length > 0
|
||||
|
||||
- name: Configure log cleanup cron (monthly)
|
||||
cron:
|
||||
name: "wetgit-log-cleanup"
|
||||
user: wetgit
|
||||
day: "1"
|
||||
hour: "5"
|
||||
minute: "0"
|
||||
job: "find {{ app_dir }}/logs -name '*.log' -mtime +30 -delete"
|
||||
75
ansible/roles/wetgit-forgejo/templates/app.ini.j2
Normal file
75
ansible/roles/wetgit-forgejo/templates/app.ini.j2
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
; WetGit Forgejo configuration — managed by Ansible
|
||||
; This file is merged with Forgejo's defaults on first boot.
|
||||
; After first boot, Forgejo writes its own app.ini in /data/gitea/conf/.
|
||||
; This template is used to seed initial configuration.
|
||||
|
||||
[DEFAULT]
|
||||
APP_NAME = WetGit
|
||||
|
||||
[server]
|
||||
DOMAIN = {{ forgejo_domain }}
|
||||
SSH_DOMAIN = {{ forgejo_domain }}
|
||||
ROOT_URL = https://{{ forgejo_domain }}/
|
||||
HTTP_PORT = 3000
|
||||
; HTTPS-only — no SSH, firewall blocks port 2222
|
||||
DISABLE_SSH = true
|
||||
LFS_START_SERVER = true
|
||||
OFFLINE_MODE = false
|
||||
|
||||
[database]
|
||||
DB_TYPE = sqlite3
|
||||
PATH = /data/gitea/forgejo.db
|
||||
|
||||
[service]
|
||||
DISABLE_REGISTRATION = true
|
||||
REQUIRE_SIGNIN_VIEW = false
|
||||
DEFAULT_KEEP_EMAIL_PRIVATE = true
|
||||
|
||||
[repository]
|
||||
DEFAULT_BRANCH = main
|
||||
PREFERRED_LICENSES = MIT License,CC0-1.0
|
||||
MAX_CREATION_LIMIT = -1
|
||||
ENABLE_PUSH_CREATE_USER = true
|
||||
ENABLE_PUSH_CREATE_ORG = true
|
||||
; 100 MB max file size for large law datasets
|
||||
MAX_FILE_SIZE = 104857600
|
||||
|
||||
[git]
|
||||
MAX_GIT_DIFF_LINES = 10000
|
||||
MAX_GIT_DIFF_FILES = 1000
|
||||
|
||||
[git.timeout]
|
||||
DEFAULT = 600
|
||||
MIGRATE = 1200
|
||||
MIRROR = 600
|
||||
CLONE = 600
|
||||
PULL = 600
|
||||
GC = 120
|
||||
|
||||
[lfs]
|
||||
PATH = /data/git/lfs
|
||||
|
||||
[ui]
|
||||
DEFAULT_THEME = forgejo-auto
|
||||
SHOW_USER_EMAIL = false
|
||||
|
||||
[actions]
|
||||
ENABLED = true
|
||||
|
||||
[indexer]
|
||||
REPO_INDEXER_ENABLED = true
|
||||
REPO_INDEXER_PATH = /data/gitea/indexers/repos.bleve
|
||||
REPO_INDEXER_EXCLUDE = node_modules/**
|
||||
|
||||
[markup.markdown]
|
||||
ENABLED = true
|
||||
FILE_EXTENSIONS = .md,.markdown
|
||||
|
||||
[mailer]
|
||||
ENABLED = true
|
||||
PROTOCOL = smtp+starttls
|
||||
SMTP_ADDR = {{ forgejo_smtp_host | default('smtp.email.undefined') }}
|
||||
SMTP_PORT = {{ forgejo_smtp_port | default(587) }}
|
||||
FROM = Coornhert <coornhert@wetgit.nl>
|
||||
USER = {{ forgejo_smtp_user | default('') }}
|
||||
PASSWD = {{ forgejo_smtp_password | default('') }}
|
||||
37
ansible/roles/wetgit-forgejo/templates/backup.sh.j2
Normal file
37
ansible/roles/wetgit-forgejo/templates/backup.sh.j2
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env bash
set -euo pipefail

# WetGIT Forgejo backup — managed by Ansible
# Uses Forgejo's built-in dump command (no downtime).

BACKUP_DIR="{{ app_dir }}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=14
DUMP_NAME="backup-${TIMESTAMP}.tar.gz"

# Print a message prefixed with the current time. The timestamp is taken per
# message so that long-running steps are logged with their real time.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

log "Starting WetGit backup..."

# Forgejo dump (runs inside container, no service stop needed).
# Runs as the container's unprivileged "git" user: a plain `docker exec`
# would run the command as root, and Forgejo refuses to run as root.
# NOTE(review): assumes the rootful Forgejo image (the compose file sets
# USER_UID/USER_GID), whose service account is named "git" — confirm.
docker exec -u git wetgit-forgejo forgejo dump \
    --type tar.gz \
    --file "/data/${DUMP_NAME}" \
    2>&1 || {
    log "ERROR: Forgejo dump failed"
    exit 1
}

# Move dump from container volume to backup dir
mv "{{ forgejo_data_dir }}/${DUMP_NAME}" \
    "$BACKUP_DIR/wetgit-forgejo-${TIMESTAMP}.tar.gz"

# Also back up Redis. BGSAVE writes an RDB snapshot (dump.rdb) and that
# snapshot is what gets copied — the AOF itself is not part of the backup.
# Best-effort: a Redis failure must not fail the Forgejo backup.
docker exec wetgit-redis redis-cli BGSAVE 2>/dev/null || true
sleep 2
cp "{{ data_dir }}/redis/dump.rdb" \
    "$BACKUP_DIR/wetgit-redis-${TIMESTAMP}.rdb" 2>/dev/null || true

# Clean old backups
find "$BACKUP_DIR" -name "wetgit-forgejo-*.tar.gz" -mtime +${RETENTION_DAYS} -delete
find "$BACKUP_DIR" -name "wetgit-redis-*.rdb" -mtime +${RETENTION_DAYS} -delete

log "Backup complete: wetgit-forgejo-${TIMESTAMP}.tar.gz"
|
||||
45
ansible/roles/wetgit-forgejo/templates/docker-compose.yml.j2
Normal file
45
ansible/roles/wetgit-forgejo/templates/docker-compose.yml.j2
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
services:
|
||||
forgejo:
|
||||
image: codeberg.org/forgejo/forgejo:10
|
||||
container_name: wetgit-forgejo
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- USER_UID={{ wetgit_uid }}
|
||||
- USER_GID={{ wetgit_gid }}
|
||||
volumes:
|
||||
- {{ forgejo_data_dir }}:/data
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
ports:
|
||||
- "{{ backend_host }}:{{ forgejo_port }}:3000"
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 1G
|
||||
cpus: "2.0"
|
||||
reservations:
|
||||
memory: 256M
|
||||
cpus: "0.5"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:3000/api/v1/version"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
networks:
|
||||
- wetgit
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: wetgit-redis
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "{{ backend_host }}:{{ redis_port }}:6379"
|
||||
volumes:
|
||||
- {{ data_dir }}/redis:/data
|
||||
command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
|
||||
networks:
|
||||
- wetgit
|
||||
|
||||
networks:
|
||||
wetgit:
|
||||
name: wetgit-network
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/env bash
set -euo pipefail

# Mirror WetGit repos from self-hosted Forgejo to Codeberg
# Managed by Ansible — runs daily at 04:00

CODEBERG_USER="coornhert"
CODEBERG_TOKEN_FILE="{{ app_dir }}/.codeberg-token"
MIRROR_DIR="{{ app_dir }}/mirrors"

# Print a message prefixed with the current time (taken per message, so each
# repo's log lines carry the time they were actually written).
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"
}

REPOS=(
    "wetgit/meta"
    "wetgit/rijk"
    # Add more as they are created:
    # "wetgit/cvdr-noord-holland"
    # "wetgit/eu"
)

if [ ! -f "$CODEBERG_TOKEN_FILE" ]; then
    log "ERROR: Codeberg token not found at $CODEBERG_TOKEN_FILE"
    exit 1
fi

CODEBERG_TOKEN=$(cat "$CODEBERG_TOKEN_FILE")
mkdir -p "$MIRROR_DIR"

for REPO in "${REPOS[@]}"; do
    REPO_NAME=$(basename "$REPO")
    REPO_MIRROR_DIR="$MIRROR_DIR/$REPO_NAME.git"
    FORGEJO_URL="https://{{ forgejo_domain }}/${REPO}.git"
    CODEBERG_URL="https://${CODEBERG_USER}:${CODEBERG_TOKEN}@codeberg.org/${REPO}.git"

    log "Mirroring $REPO..."

    if [ ! -d "$REPO_MIRROR_DIR" ]; then
        log "Initial clone from Forgejo..."
        git clone --bare "$FORGEJO_URL" "$REPO_MIRROR_DIR"
    else
        log "Fetching from Forgejo..."
        git -C "$REPO_MIRROR_DIR" fetch origin --prune \
            '+refs/heads/*:refs/heads/*' '+refs/tags/*:refs/tags/*'
    fi

    # Create or refresh the push remote on every run, not only after the
    # initial clone: the URL embeds the token, so a rotated token must be
    # picked up, and a run that was interrupted right after the clone must
    # not leave a mirror without its "codeberg" remote.
    if git -C "$REPO_MIRROR_DIR" remote get-url codeberg >/dev/null 2>&1; then
        git -C "$REPO_MIRROR_DIR" remote set-url codeberg "$CODEBERG_URL"
    else
        git -C "$REPO_MIRROR_DIR" remote add codeberg "$CODEBERG_URL"
    fi

    log "Pushing to Codeberg..."
    # A failed push is logged but does not abort the remaining repos.
    git -C "$REPO_MIRROR_DIR" push codeberg --mirror --force 2>&1 || {
        log "WARNING: Push to Codeberg failed for $REPO (non-fatal)"
    }

    log "Done: $REPO"
done

log "Mirror complete."
|
||||
5
ansible/roles/wetgit-nginx/handlers/main.yml
Normal file
5
ansible/roles/wetgit-nginx/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
- name: reload nginx
|
||||
systemd:
|
||||
name: nginx
|
||||
state: reloaded
|
||||
118
ansible/roles/wetgit-nginx/tasks/main.yml
Normal file
118
ansible/roles/wetgit-nginx/tasks/main.yml
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
---
|
||||
# Nginx vhosts for WetGIT
|
||||
# IMPORTANT: Only adds vhost configs. Does NOT touch global nginx.conf
|
||||
# (managed by dt-platform's nginx role).
|
||||
#
|
||||
# Strategy: Deploy HTTP-only first → get SSL certs → deploy full HTTPS config.
|
||||
|
||||
# --- Step 1: Check existing SSL certificates ---
|
||||
|
||||
- name: Check if API SSL certificate exists
|
||||
stat:
|
||||
path: "/etc/letsencrypt/live/{{ server_name }}/fullchain.pem"
|
||||
register: ssl_cert_api
|
||||
|
||||
- name: Check if Forgejo SSL certificate exists
|
||||
stat:
|
||||
path: "/etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem"
|
||||
register: ssl_cert_git
|
||||
|
||||
# --- Step 2: Deploy HTTP-only configs for domains without certs ---
|
||||
|
||||
- name: Deploy API HTTP-only vhost (pre-SSL)
|
||||
copy:
|
||||
content: |
|
||||
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name {{ server_name }};
|
||||
location /.well-known/acme-challenge/ { root /var/www/certbot; }
|
||||
location / { return 503; }
|
||||
}
|
||||
dest: /etc/nginx/sites-available/wetgit-api.conf
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
when: not ssl_cert_api.stat.exists
|
||||
notify: reload nginx
|
||||
|
||||
- name: Deploy Forgejo HTTP-only vhost (pre-SSL)
|
||||
copy:
|
||||
content: |
|
||||
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name {{ forgejo_domain }};
|
||||
location /.well-known/acme-challenge/ { root /var/www/certbot; }
|
||||
location / { return 503; }
|
||||
}
|
||||
dest: /etc/nginx/sites-available/wetgit-git.conf
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
when: not ssl_cert_git.stat.exists
|
||||
notify: reload nginx
|
||||
|
||||
# --- Step 3: Enable vhosts and reload nginx ---
|
||||
|
||||
- name: Enable API vhost
|
||||
file:
|
||||
src: /etc/nginx/sites-available/wetgit-api.conf
|
||||
dest: /etc/nginx/sites-enabled/wetgit-api.conf
|
||||
state: link
|
||||
notify: reload nginx
|
||||
|
||||
- name: Enable Forgejo vhost
|
||||
file:
|
||||
src: /etc/nginx/sites-available/wetgit-git.conf
|
||||
dest: /etc/nginx/sites-enabled/wetgit-git.conf
|
||||
state: link
|
||||
notify: reload nginx
|
||||
|
||||
# Force handler to run now so nginx has the HTTP configs before certbot
|
||||
- name: Flush handlers (reload nginx for certbot)
|
||||
meta: flush_handlers
|
||||
|
||||
# --- Step 4: Obtain SSL certificates via webroot ---
|
||||
|
||||
- name: Obtain SSL certificate for {{ server_name }}
|
||||
command: >
|
||||
certbot certonly --webroot
|
||||
-w /var/www/certbot
|
||||
-d {{ server_name }}
|
||||
--non-interactive --agree-tos
|
||||
--email coornhert@wetgit.nl
|
||||
when: not ssl_cert_api.stat.exists
|
||||
register: certbot_api
|
||||
|
||||
- name: Obtain SSL certificate for {{ forgejo_domain }}
|
||||
command: >
|
||||
certbot certonly --webroot
|
||||
-w /var/www/certbot
|
||||
-d {{ forgejo_domain }}
|
||||
--non-interactive --agree-tos
|
||||
--email coornhert@wetgit.nl
|
||||
when: not ssl_cert_git.stat.exists
|
||||
register: certbot_git
|
||||
|
||||
# --- Step 5: Deploy full HTTPS configs ---
|
||||
|
||||
- name: Deploy API nginx vhost (full HTTPS)
|
||||
template:
|
||||
src: wetgit-api.conf.j2
|
||||
dest: /etc/nginx/sites-available/wetgit-api.conf
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
notify: reload nginx
|
||||
|
||||
- name: Deploy Forgejo nginx vhost (full HTTPS)
|
||||
template:
|
||||
src: wetgit-git.conf.j2
|
||||
dest: /etc/nginx/sites-available/wetgit-git.conf
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
notify: reload nginx
|
||||
51
ansible/roles/wetgit-nginx/templates/wetgit-api.conf.j2
Normal file
51
ansible/roles/wetgit-nginx/templates/wetgit-api.conf.j2
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# WetGIT API — managed by WetGIT Ansible (not dt-platform)
|
||||
# Do NOT edit manually
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name {{ server_name }};
|
||||
|
||||
# ACME challenge (reuse existing certbot webroot)
|
||||
location /.well-known/acme-challenge/ {
|
||||
root /var/www/certbot;
|
||||
}
|
||||
|
||||
location / {
|
||||
return 301 https://$host$request_uri;
|
||||
}
|
||||
}
|
||||
|
||||
server {
|
||||
listen 443 ssl http2;
|
||||
listen [::]:443 ssl http2;
|
||||
server_name {{ server_name }};
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/{{ server_name }}/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/{{ server_name }}/privkey.pem;
|
||||
|
||||
# Security headers
|
||||
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-Frame-Options "DENY" always;
|
||||
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||
|
||||
# API proxy
|
||||
location / {
|
||||
proxy_pass http://{{ backend_host }}:{{ backend_port }};
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# Timeouts for long-running legislation processing
|
||||
proxy_read_timeout 120s;
|
||||
proxy_connect_timeout 10s;
|
||||
}
|
||||
|
||||
# Health check (no rate limit)
|
||||
location = /health {
|
||||
proxy_pass http://{{ backend_host }}:{{ backend_port }}/health;
|
||||
access_log off;
|
||||
}
|
||||
}
|
||||
52
ansible/roles/wetgit-nginx/templates/wetgit-git.conf.j2
Normal file
52
ansible/roles/wetgit-nginx/templates/wetgit-git.conf.j2
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# Forgejo (git.wetgit.nl) — managed by WetGIT Ansible (not dt-platform)
|
||||
# Do NOT edit manually
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
listen [::]:80;
|
||||
server_name {{ forgejo_domain }};
|
||||
|
||||
location /.well-known/acme-challenge/ {
|
||||
root /var/www/certbot;
|
||||
}
|
||||
|
||||
location / {
|
||||
return 301 https://$host$request_uri;
|
||||
}
|
||||
}
|
||||
|
||||
server {
|
||||
listen 443 ssl http2;
|
||||
listen [::]:443 ssl http2;
|
||||
server_name {{ forgejo_domain }};
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/{{ forgejo_domain }}/privkey.pem;
|
||||
|
||||
# Security headers
|
||||
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
|
||||
add_header X-Content-Type-Options "nosniff" always;
|
||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||
|
||||
# Large body size for git pushes (law datasets can be large)
|
||||
client_max_body_size 512M;
|
||||
|
||||
# Timeouts for large git operations
|
||||
proxy_connect_timeout 300;
|
||||
proxy_send_timeout 300;
|
||||
proxy_read_timeout 300;
|
||||
|
||||
location / {
|
||||
proxy_pass http://{{ backend_host }}:{{ forgejo_port }};
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# WebSocket support (Forgejo live features)
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
}
|
||||
}
|
||||
27
ansible/site.yml
Normal file
27
ansible/site.yml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
---
|
||||
# WetGIT - Nederlandse wetgeving als code
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook ansible/site.yml
|
||||
# ansible-playbook ansible/site.yml --tags forgejo
|
||||
# ansible-playbook ansible/site.yml --tags app
|
||||
# ansible-playbook ansible/site.yml --tags nginx
|
||||
# ansible-playbook ansible/site.yml --check (dry-run)
|
||||
#
|
||||
# NOTE: This server is shared with dt-platform.
|
||||
# This playbook only manages WetGIT resources.
|
||||
# System-level config (users, packages, firewall) is managed by dt-platform.
|
||||
|
||||
- name: Deploy WetGIT
|
||||
hosts: wetgit
|
||||
become: yes
|
||||
|
||||
roles:
|
||||
- role: wetgit-forgejo
|
||||
tags: [forgejo, docker]
|
||||
|
||||
- role: wetgit-app
|
||||
tags: [app]
|
||||
|
||||
- role: wetgit-nginx
|
||||
tags: [nginx]
|
||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1774610258,
|
||||
"narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixpkgs-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
116
flake.nix
Normal file
116
flake.nix
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
# Development flake: provides one dev shell per default system containing the
# Python environment, infrastructure tooling and a shell hook that loads `.env`
# and bootstraps a local `.venv`.
{
  description = "WetGit - Nederlandse wetgeving als code";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = nixpkgs.legacyPackages.${system};

        # Python 3.13 (same version Ansible uses; avoids PATH conflicts)
        pythonEnv = pkgs.python313.withPackages (ps: with ps; [
          # Conversion pipeline (PRD: technical stack)
          lxml # BWB XML parsing with XPath/XSLT
          pygit2 # Git operations via libgit2 (faster than GitPython)
          pyyaml # YAML frontmatter generation
          python-frontmatter # Markdown + YAML frontmatter parsing

          # API layer (PRD: FastAPI)
          fastapi
          uvicorn # ASGI server
          httpx # Async HTTP client (SRU API, EUR-Lex)
          pydantic # Data validation

          # Background tasks (PRD: Celery + Redis)
          celery
          redis # Python Redis client

          # CLI tool (PRD: wetgit CLI)
          click
          rich # Terminal formatting

          # Testing
          pytest
          pytest-cov
          pytest-asyncio

          # Development tools
          black
          ruff
          mypy
          pip
          setuptools
          wheel
          build

          # Typing stubs
          types-requests
          types-pyyaml
        ]);

      in {
        devShells.default = pkgs.mkShell {
          name = "wetgit";

          buildInputs = with pkgs; [
            # Python environment
            pythonEnv

            # Dependency management
            uv

            # Ansible (infrastructure provisioning on Hetzner)
            ansible
            ansible-lint

            # Hetzner Cloud CLI
            hcloud

            # Redis server (local development)
            redis

            # Git & tools
            git
            jq
            yq-go
            curl

            # Native dependencies for pygit2
            libgit2
          ];

          # Runs on shell entry: prints tool versions, exports every variable
          # defined in `.env` (if present; `set -a` marks them for export),
          # creates `.venv` with the PyPI-only `agentmail` package on first
          # use, then activates that venv.
          shellHook = ''
            echo "WetGit - Nederlandse wetgeving als code"
            echo ""
            echo "Python: $(python --version)"
            echo "Ansible: $(ansible --version 2>/dev/null | head -1)"
            echo "hcloud: $(hcloud version 2>/dev/null)"
            echo ""
            echo "Pipeline tools: lxml, pygit2, fastapi"
            echo "Infra tools: ansible, hcloud"
            echo ""

            # Laad .env als die bestaat (API keys, Hetzner token)
            if [ -f .env ]; then
            set -a
            source .env
            set +a
            echo "Loaded environment from .env"
            echo ""
            fi

            # Venv voor PyPI-only packages (agentmail etc.)
            if [ ! -d .venv ]; then
            uv venv .venv --python python3.13 --seed
            uv pip install --python .venv/bin/python agentmail
            echo "Created .venv and installed PyPI dependencies"
            fi
            source .venv/bin/activate
          '';
        };
      });
}
|
||||
92
pyproject.toml
Normal file
92
pyproject.toml
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
[project]
|
||||
name = "wetgit"
|
||||
version = "0.1.0"
|
||||
description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit"
|
||||
readme = "README.md"
|
||||
# Table form on purpose: the bare SPDX string (`license = "MIT"`) is only
# understood by setuptools >= 77 and may not be combined with the
# "License ::" classifier listed under `classifiers`, while the build-system
# section allows setuptools 75.
license = { text = "MIT" }
|
||||
requires-python = ">=3.12"
|
||||
authors = [
|
||||
{ name = "Coornhert", email = "coornhert@wetgit.nl" },
|
||||
]
|
||||
keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"]
|
||||
classifiers = [
|
||||
"Development Status :: 2 - Pre-Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Legal Industry",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Topic :: Text Processing :: Markup",
|
||||
]
|
||||
dependencies = [
|
||||
"lxml>=5.0",
|
||||
"pygit2>=1.13",
|
||||
"pyyaml>=6.0",
|
||||
"python-frontmatter>=1.1",
|
||||
"httpx>=0.27",
|
||||
"click>=8.1",
|
||||
"rich>=13.0",
|
||||
"pydantic>=2.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
api = [
|
||||
"fastapi>=0.115",
|
||||
"uvicorn>=0.30",
|
||||
"celery>=5.4",
|
||||
"redis>=5.0",
|
||||
]
|
||||
dev = [
|
||||
"pytest>=8.0",
|
||||
"pytest-cov>=5.0",
|
||||
"pytest-asyncio>=0.24",
|
||||
"black>=24.0",
|
||||
"ruff>=0.6",
|
||||
"mypy>=1.11",
|
||||
"types-pyyaml",
|
||||
"types-requests",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
wetgit = "wetgit.cli.main:cli"
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://wetgit.nl"
|
||||
Repository = "https://git.wetgit.nl/wetgit/meta"
|
||||
Issues = "https://git.wetgit.nl/wetgit/meta/issues"
|
||||
|
||||
[build-system]
|
||||
requires = ["setuptools>=75.0"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
markers = [
|
||||
"unit: Unit tests (fast, no I/O)",
|
||||
"integration: Integration tests (may need network/disk)",
|
||||
"slow: Slow tests (large XML parsing, bulk operations)",
|
||||
]
|
||||
asyncio_mode = "auto"
|
||||
|
||||
[tool.black]
|
||||
line-length = 99
|
||||
target-version = ["py313"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 99
|
||||
target-version = "py313"
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"]
|
||||
ignore = ["E501"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
known-first-party = ["wetgit"]
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.13"
|
||||
warn_return_any = true
|
||||
warn_unused_configs = true
|
||||
disallow_untyped_defs = true
|
||||
3
src/wetgit/__init__.py
Normal file
3
src/wetgit/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
"""WetGit — Nederlandse wetgeving als code."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
1
src/wetgit/api/__init__.py
Normal file
1
src/wetgit/api/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""FastAPI REST API."""
|
||||
1
src/wetgit/cli/__init__.py
Normal file
1
src/wetgit/cli/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""WetGit CLI tool."""
|
||||
17
src/wetgit/cli/main.py
Normal file
17
src/wetgit/cli/main.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""WetGit CLI — command-line interface."""
|
||||
|
||||
import click
|
||||
|
||||
from wetgit import __version__
|
||||
|
||||
|
||||
# Root command group of the `wetgit` executable; subcommands register
# themselves with `@cli.command()`. click uses the docstring below as the
# `--help` text, so it is user-facing output rather than documentation.
@click.group()
@click.version_option(version=__version__, prog_name="wetgit")
def cli() -> None:
    """WetGit — Nederlandse wetgeving als code."""
|
||||
|
||||
|
||||
# `wetgit version` subcommand: echoes "wetgit <version>" using the package's
# `__version__`. The docstring is the help text click shows for the command.
@cli.command()
def version() -> None:
    """Toon de WetGit versie."""
    click.echo(f"wetgit {__version__}")
|
||||
55
src/wetgit/models.py
Normal file
55
src/wetgit/models.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""Domain models voor WetGit."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification.

    The ``str`` mixin makes every member a string equal to its value, so
    members can be compared with and serialized as plain strings.
    """

    WET = "wet"
    AMVB = "amvb"
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"
|
||||
|
||||
|
||||
class RegelingStatus(str, Enum):
    """Status of a regulation.

    The ``str`` mixin makes every member a string equal to its value.
    """

    GELDEND = "geldend"
    VERVALLEN = "vervallen"
|
||||
|
||||
|
||||
# NOTE(review): `frozen=True` (with the default `eq=True`) generates a
# `__hash__` over all fields, but `eu_implementatie` is a list, so calling
# `hash()` on an instance raises TypeError. Confirm whether instances are
# meant to be used as dict keys / set members.
@dataclass(frozen=True)
class Regeling:
    """Metadata of a regulation (regeling)."""

    # Required fields.
    bwb_id: str
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date
    # Optional fields; None means "not provided".
    datum_laatste_wijziging: date | None = None
    datum_verval: date | None = None
    citeertitel: str | None = None
    ministerie: str | None = None
    bron_url: str | None = None
    # default_factory gives every instance its own empty list.
    eu_implementatie: list[dict[str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
# NOTE(review): as a frozen dataclass this gets a generated `__hash__`, but
# the list-typed `leden` field makes `hash()` on an instance raise TypeError.
@dataclass(frozen=True)
class Artikel:
    """An article within a regulation."""

    nummer: str
    titel: str | None
    inhoud: str
    # default_factory gives every instance its own empty list.
    leden: list[str] = field(default_factory=list)
|
||||
1
src/wetgit/pipeline/__init__.py
Normal file
1
src/wetgit/pipeline/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""BWB/CVDR/EUR-Lex conversie-pipeline."""
|
||||
487
src/wetgit/pipeline/bwb_parser.py
Normal file
487
src/wetgit/pipeline/bwb_parser.py
Normal file
|
|
@ -0,0 +1,487 @@
|
|||
"""BWB XML naar Markdown parser.
|
||||
|
||||
Parseert BWB toestand-XML (schema versie 2.0) naar Markdown + YAML frontmatter.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from io import StringIO
|
||||
|
||||
import yaml
|
||||
from lxml import etree
|
||||
|
||||
|
||||
@dataclass
class ParsedRegeling:
    """Result of parsing a BWB toestand XML document."""

    bwb_id: str
    # Display title of the regulation.
    titel: str
    citeertitel: str | None
    # Value of the `soort` attribute of <wetgeving>.
    soort: str
    # Raw `inwerkingtredingsdatum` attribute value; None when absent.
    datum_inwerkingtreding: str | None
    # Complete document: YAML frontmatter, H1 title and body.
    markdown: str
    # The same metadata that is serialized into the frontmatter block.
    frontmatter: dict[str, str | list[str] | None]
|
||||
|
||||
|
||||
def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Parse a BWB toestand XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        ParsedRegeling holding the extracted metadata, the frontmatter dict
        and the complete Markdown document.

    Raises:
        ValueError: If the document contains no <wetgeving> element.
    """
    tree = etree.parse(xml_path)
    wetgeving = tree.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # Metadata
    soort = wetgeving.get("soort", "onbekend")
    bwb_id = _extract_bwb_id(tree)
    datum = wetgeving.get("inwerkingtredingsdatum")

    # Collapse whitespace runs: titles wrapped over several lines in the XML
    # would otherwise put a line break inside the single-line "# titel"
    # heading and the frontmatter value.
    intitule = " ".join(wetgeving.findtext(".//intitule", default="").split())
    citeertitel = " ".join(wetgeving.findtext(".//citeertitel", default="").split()) or None
    titel = citeertitel or intitule or bwb_id

    # The body lives under different elements depending on the document kind;
    # the first location that exists wins:
    #   formal acts:             <wet-besluit><wettekst>
    #   ministerial regulations: <regeling-tekst>
    #   circulars / ZBO:         <circulaire-tekst>
    #   fallback:                any <wettekst>
    body_paths = (
        ".//wet-besluit//wettekst",
        ".//regeling-tekst",
        ".//circulaire-tekst",
        ".//wettekst",
    )
    wettekst = None
    for path in body_paths:
        wettekst = wetgeving.find(path)
        if wettekst is not None:
            break

    md_parts: list[str] = _parse_wettekst(wettekst) if wettekst is not None else []

    # Annexes are located outside the body text element.
    for bijlage in wetgeving.findall(".//bijlage"):
        bijlage_md = _parse_bijlage(bijlage)
        if bijlage_md:
            md_parts.append(bijlage_md)

    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
    }
    if citeertitel:
        frontmatter["citeertitel"] = citeertitel

    # sort_keys=False keeps the insertion order of the dict above.
    fm_yaml = yaml.dump(frontmatter, allow_unicode=True, default_flow_style=False, sort_keys=False)
    markdown = f"---\n{fm_yaml.strip()}\n---\n\n# {titel}\n\n"
    markdown += "\n\n".join(md_parts)
    markdown += "\n"

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
|
||||
|
||||
|
||||
def _extract_bwb_id(tree: etree._ElementTree) -> str:
    """Return the BWB identifier of the document.

    The ``bwb-id`` attribute of a <toestand> element is preferred. If no
    <toestand> carries one, the first ``bwb-id`` attribute found on
    <bwb-inputbestand> or any of its descendants is used.

    Args:
        tree: The parsed XML document.

    Returns:
        The BWB identifier, or the literal "ONBEKEND" when none is present.
    """
    for toestand in tree.iter("toestand"):
        bwb_id = toestand.get("bwb-id")
        if bwb_id:
            return bwb_id

    # Fallback: look in the <bwb-inputbestand> subtree.
    inp = tree.find(".//bwb-inputbestand")
    if inp is not None:
        for node in inp.iter():
            bwb_id = node.get("bwb-id")
            if bwb_id:
                return bwb_id
    return "ONBEKEND"
|
||||
|
||||
|
||||
def _parse_wettekst(wettekst: etree._Element) -> list[str]:
    """Render the direct children of a body-text element as Markdown blocks.

    Structural containers are delegated to ``_parse_structuur``, articles to
    ``_parse_artikel``; loose text, tables and images are rendered in place.
    Children with any other tag are ignored.

    Args:
        wettekst: The element holding the body of the regulation.

    Returns:
        The rendered blocks in document order, without blank entries.
    """
    # Structural tag -> (heading level, heading label).
    structuur = {
        "deel": (2, "Deel"),
        "boek": (2, "Boek"),
        "hoofdstuk": (2, "Hoofdstuk"),
        "titeldeel": (2, "Titel"),
        "afdeling": (3, "Afdeling"),
        "paragraaf": (3, "Paragraaf"),
        "circulaire.divisie": (2, ""),
        "sub-paragraaf": (3, "Paragraaf"),
        "divisie": (2, ""),
    }

    blocks: list[str] = []
    for node in wettekst:
        name = node.tag
        if name in structuur:
            niveau, label = structuur[name]
            blocks.append(_parse_structuur(node, level=niveau, label=label))
        elif name in ("artikel", "enig-artikel"):
            blocks.append(_parse_artikel(node))
        elif name == "tekst":
            blocks.append(_parse_tekst_content(node))
        elif name == "al":
            blocks.append(_get_text(node))
        elif name == "table":
            blocks.append(_parse_cals_table(node))
        elif name == "tabel":
            blocks.append("*[tabel]*")
        elif name in ("plaatje", "illustratie"):
            blocks.append("*[afbeelding]*")

    return [block for block in blocks if block.strip()]
|
||||
|
||||
|
||||
def _parse_structuur(elem: etree._Element, level: int, label: str) -> str:
    """Render a structural container (boek, hoofdstuk, afdeling, ...) as Markdown.

    The heading is composed of ``label``, the element's number and its title.
    Empty components are left out, so an unnumbered or unlabelled division
    yields e.g. "## Inleiding" instead of "##  . Inleiding". Nested containers
    are rendered recursively one heading level deeper.

    Args:
        elem: The structural element to render.
        level: Markdown heading level for this element. Levels above 6 are
            rendered as level 6, the deepest heading Markdown supports.
        label: Word placed before the number (e.g. "Hoofdstuk"); may be empty.

    Returns:
        The heading followed by the rendered children, separated by blank
        lines.
    """
    nr = ""
    titel = ""

    # <kop> carries number and title for books and some other structures.
    kop = elem.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        titel = kop.findtext("./titel", default="").strip()

    # Fallback: number/title as direct children of the element itself.
    if not nr:
        nr = elem.findtext("./nr", default="").strip()
    if not titel:
        for title_tag in (
            "hoofdstuktitel",
            "titeldeel-titel",
            "afdelingtitel",
            "paragraaftitel",
            "boektitel",
            "titel",
        ):
            titel = elem.findtext(f"./{title_tag}", default="").strip()
            if titel:
                break

    # A heading must stay on one line, so collapse wrapped titles.
    titel = " ".join(titel.split())

    prefix = " ".join(part for part in (label, nr) if part)
    caption = f"{prefix}. {titel}" if prefix and titel else prefix or titel
    header = f"{'#' * min(level, 6)} {caption}".rstrip()

    skip_tags = {
        "nr", "kop", "titel",
        "hoofdstuktitel", "titeldeel-titel", "afdelingtitel",
        "paragraaftitel", "boektitel",
    }
    # Nested structural tag -> heading label.
    nested_labels = {
        "deel": "Deel",
        "boek": "Boek",
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
        "circulaire.divisie": "",
        "sub-paragraaf": "Paragraaf",
        "divisie": "",
    }

    parts = [header]

    for child in elem:
        tag = child.tag
        if tag in skip_tags:
            continue
        elif tag in nested_labels:
            parts.append(_parse_structuur(child, level=level + 1, label=nested_labels[tag]))
        elif tag in ("artikel", "enig-artikel"):
            parts.append(_parse_artikel(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        elif tag == "tussenkop":
            text = _get_text(child).strip()
            if text:
                sub_heading = "#" * min(level + 1, 6)
                parts.append(f"{sub_heading} {text}")
        elif tag == "redactie":
            text = _get_text(child).strip()
            if text:
                parts.append(f"*[{text}]*")
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")

    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _parse_artikel(artikel: etree._Element) -> str:
    """Render an <artikel> (or <enig-artikel>) element as Markdown.

    The article becomes a level-3 heading followed by its title, members
    (leden), paragraphs, lists, tables, images and editorial notes. Tables,
    images and editorial notes directly under the article are rendered the
    same way as inside a member instead of being dropped.

    Args:
        artikel: The article element.

    Returns:
        The heading and the rendered children, separated by blank lines.
    """
    # The number may sit directly under the article or inside its <kop>,
    # hence the descendant search.
    nr = artikel.findtext(".//nr", default="").strip()
    heading = f"### Artikel {nr}" if nr else "### Artikel"

    parts = [heading]

    for child in artikel:
        tag = child.tag
        if tag in ("nr", "lidnr"):
            continue
        elif tag == "titel":
            titel_text = _get_text(child).strip()
            if titel_text:
                parts.append(f"*{titel_text}*")
        elif tag == "lid":
            parts.append(_parse_lid(child))
        elif tag == "al":
            parts.append(_get_text(child))
        elif tag == "lijst":
            parts.append(_parse_lijst(child))
        elif tag == "tekst":
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag == "redactie":
            text = _get_text(child).strip()
            if text:
                parts.append(f"*[{text}]*")
        elif tag == "gereserveerd":
            parts.append("*[Dit artikel is gereserveerd.]*")
        elif tag == "vervallen":
            parts.append("*[Dit artikel is vervallen.]*")
        elif tag == "lid-vervallen":
            lidnr = child.findtext(".//lidnr", default="").strip()
            parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*")

    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _parse_lid(lid: etree._Element) -> str:
    """Render a <lid> (article member) element as Markdown.

    A numbered member with at most one content block is returned on a single
    line ("**1.** text"); otherwise the blocks are separated by blank lines.

    Args:
        lid: The member element.

    Returns:
        The rendered member.
    """
    lidnr = lid.findtext(".//lidnr", default="").strip()
    chunks: list[str] = [f"**{lidnr}.**"] if lidnr else []

    for node in lid:
        name = node.tag
        if name == "al":
            chunks.append(_get_text(node))
        elif name == "lijst":
            chunks.append(_parse_lijst(node))
        elif name == "tekst":
            chunks.append(_parse_tekst_content(node))
        elif name == "table":
            chunks.append(_parse_cals_table(node))
        elif name == "tabel":
            chunks.append("*[tabel]*")
        elif name == "formule":
            chunks.append(f"*[formule: {_get_text(node)}]*")
        elif name == "redactie":
            note = _get_text(node).strip()
            if note:
                chunks.append(f"*[{note}]*")
        # <lidnr> and any other tag contribute nothing here.

    inline = bool(lidnr) and len(chunks) <= 2
    separator = " " if inline else "\n\n"
    return separator.join(chunks)
|
||||
|
||||
|
||||
def _parse_lijst(lijst: etree._Element) -> str:
    """Render a <lijst> element as a Markdown list.

    Only the direct <li> children are turned into items. Items of nested lists
    are rendered as part of their parent item (through
    ``_parse_tekst_content``), so they are not emitted a second time at the
    top level.

    Args:
        lijst: The list element.

    Returns:
        One line per item, prefixed with the item's own number or with "- "
        when it has none.
    """
    items: list[str] = []
    for li in lijst.findall("./li"):
        nr = li.findtext("./li.nr", default="").strip()
        body = li.find("./li.body")
        # Without a <li.body> wrapper the content blocks sit directly under
        # <li>; rendering those blocks (rather than all text of the item)
        # keeps the <li.nr> text from being repeated after the prefix.
        text = _parse_tekst_content(body if body is not None else li)
        prefix = f"{nr} " if nr else "- "
        items.append(f"{prefix}{text}")
    return "\n".join(items)
|
||||
|
||||
|
||||
def _parse_tekst_content(elem: etree._Element) -> str:
    """Render the mixed block content (al, lijst, table, ...) of an element.

    When none of the direct children is a recognised block, the element's
    complete inline text is returned instead.

    Args:
        elem: The container element.

    Returns:
        The rendered blocks separated by blank lines.
    """
    rendered: list[str] = []
    for node in elem:
        name = node.tag
        if name == "al":
            rendered.append(_get_text(node))
        elif name == "lijst":
            rendered.append(_parse_lijst(node))
        elif name == "table":
            rendered.append(_parse_cals_table(node))
        elif name == "tabel":
            rendered.append("*[tabel]*")
        elif name in ("plaatje", "illustratie"):
            rendered.append("*[afbeelding]*")
        elif name == "redactie":
            note = _get_text(node).strip()
            if note:
                rendered.append(f"*[{note}]*")

    if rendered:
        return "\n\n".join(rendered)

    # No recognised blocks: fall back to the raw inline text.
    return _get_text(elem)
|
||||
|
||||
|
||||
def _parse_cals_table(table: etree._Element) -> str:
    """Render a CALS <table> element as a Markdown table.

    Rows of the first <thead> come first, followed by the rows of the first
    <tbody>. When there is no <tbody>, all remaining rows of the table are
    used; header rows are excluded there so they are not emitted twice.
    Markdown requires a header row, so the first row is always followed by
    the separator line, also when the table has no <thead>.

    Args:
        table: The CALS table element.

    Returns:
        The Markdown table, or the placeholder "*[tabel]*" when the table
        contains no rows.
    """
    thead = table.find(".//thead")
    header_rows = thead.findall(".//row") if thead is not None else []

    tbody = table.find(".//tbody")
    if tbody is not None:
        body_rows = tbody.findall(".//row")
    else:
        # No tbody: rows sit directly under tgroup. Identity comparison keeps
        # the <thead> rows collected above out of the body.
        body_rows = [
            row
            for row in table.findall(".//row")
            if not any(row is header for header in header_rows)
        ]

    rows: list[list[str]] = [
        [_get_text(entry) for entry in row.findall(".//entry")]
        for row in header_rows + body_rows
    ]
    if not rows:
        return "*[tabel]*"

    # Pad short rows so every line has the same number of columns.
    max_cols = max(len(cells) for cells in rows)
    for cells in rows:
        cells.extend([""] * (max_cols - len(cells)))

    lines: list[str] = []
    for index, cells in enumerate(rows):
        # A pipe would end the cell and a newline would end the row.
        escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in cells]
        lines.append("| " + " | ".join(escaped) + " |")
        if index == 0:
            lines.append("| " + " | ".join("---" for _ in escaped) + " |")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def _parse_bijlage(bijlage: etree._Element) -> str:
    """Render a <bijlage> (annex) element as Markdown.

    The heading reads "## Bijlage <nr>. <titel>"; a missing number or title is
    simply left out. Nested structural elements get the same labels as in the
    main text (``titeldeel`` is labelled "Titel"), and CALS tables and
    illustrations are rendered like everywhere else.

    Args:
        bijlage: The annex element.

    Returns:
        The heading and the rendered children, separated by blank lines.
    """
    nr = ""
    titel = ""
    kop = bijlage.find("./kop")
    if kop is not None:
        nr = kop.findtext("./nr", default="").strip()
        # A heading must stay on one line, so collapse wrapped titles.
        titel = " ".join(kop.findtext("./titel", default="").split())

    prefix = f"Bijlage {nr}".strip()
    header = f"## {prefix}. {titel}" if titel else f"## {prefix}"

    # Nested structural tag -> heading label.
    nested_labels = {
        "hoofdstuk": "Hoofdstuk",
        "titeldeel": "Titel",
        "afdeling": "Afdeling",
        "paragraaf": "Paragraaf",
    }

    parts = [header]

    for child in bijlage:
        tag = child.tag
        if tag == "kop":
            continue
        elif tag == "artikel":
            parts.append(_parse_artikel(child))
        elif tag in ("tekst", "bijlage-tekst"):
            parts.append(_parse_tekst_content(child))
        elif tag == "table":
            parts.append(_parse_cals_table(child))
        elif tag == "tabel":
            parts.append("*[tabel]*")
        elif tag in ("plaatje", "illustratie"):
            parts.append("*[afbeelding]*")
        elif tag in nested_labels:
            parts.append(_parse_structuur(child, level=3, label=nested_labels[tag]))
        elif tag == "al":
            parts.append(_get_text(child))

    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _get_text(elem: etree._Element) -> str:
    """Return all text of an element with inline formatting applied.

    Inline elements such as <nadruk>, <sup>, <sub>/<inf>, <extref> and
    <intref> are converted by ``_collect_text``; the combined result is
    stripped of leading and trailing whitespace.
    """
    fragments: list[str] = []
    _collect_text(elem, fragments)
    combined = "".join(fragments)
    return combined.strip()
|
||||
|
||||
|
||||
def _collect_text(elem: etree._Element, parts: list[str]) -> None:
    """Append the text of ``elem`` and its descendants to ``parts``.

    Inline markup is translated on the way: <nadruk type="vet"> becomes
    ``**bold**``, <nadruk type="cur"> becomes ``*italic*``, <sup> is prefixed
    with "^" and <sub>/<inf> with "_". All other elements (including
    <extref>/<intref>, whose link target is not preserved) are descended into
    so their text is kept.

    Args:
        elem: Element whose text is collected.
        parts: Output list that receives the text fragments in document order.
    """
    if elem.text:
        parts.append(elem.text)

    # Emphasis type -> Markdown marker placed on both sides of the text.
    emphasis = {"vet": "**", "cur": "*"}

    for node in elem:
        name = node.tag
        if name == "nadruk":
            marker = emphasis.get(node.get("type", ""), "")
            inner = "".join(node.itertext())
            parts.append(f"{marker}{inner}{marker}")
        elif name == "sup":
            parts.append("^" + "".join(node.itertext()))
        elif name in ("sub", "inf"):
            parts.append("_" + "".join(node.itertext()))
        elif name == "nootref":
            # Footnote reference: keep its text as is.
            parts.append("".join(node.itertext()))
        else:
            # References and unknown inline elements: keep the text only.
            _collect_text(node, parts)

        # Text following the child still belongs to ``elem``.
        if node.tail:
            parts.append(node.tail)
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
"""Tests voor de BWB XML parser."""
|
||||
|
||||
import pytest
|
||||
|
||||
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
||||
|
||||
|
||||
# Session scope: the document is downloaded once per test run instead of once
# per test function.
@pytest.fixture(scope="session")
def grondwet_xml(tmp_path_factory: pytest.TempPathFactory) -> str:
    """Download the Grondwet (Dutch constitution) XML and return its local path.

    Requires network access; any HTTP error status fails the fixture.
    """
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
|
||||
|
||||
|
||||
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that parse the downloaded Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.bwb_id == "BWBR0001840"
        assert result.titel == "Grondwet"
        assert result.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.frontmatter["bwb_id"] == "BWBR0001840"
        assert result.frontmatter["type"] == "wet"
        assert result.frontmatter["status"] == "geldend"
        assert "wetten.overheid.nl" in str(result.frontmatter["bron"])

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "### Artikel 1" in result.markdown
        assert "gelijke gevallen gelijk behandeld" in result.markdown

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert "## Hoofdstuk 1" in result.markdown
        assert "## Hoofdstuk 2" in result.markdown

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        assert result.markdown.startswith("---\n")
        assert "\n---\n" in result.markdown

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        result = parse_bwb_xml(grondwet_xml)
        # The document starts with the frontmatter fence ...
        lines = result.markdown.split("\n")
        assert lines[0] == "---"
        # ... and contains exactly one H1, the title.
        h1_lines = [line for line in lines if line.startswith("# ")]
        assert len(h1_lines) == 1
        assert h1_lines[0] == "# Grondwet"
|
||||
Loading…
Add table
Reference in a new issue