Compare commits
2 commits
bed91e891e
...
c481ebf9e7
| Author | SHA1 | Date | |
|---|---|---|---|
| c481ebf9e7 | |||
| 1dc93b0f89 |
35 changed files with 1769 additions and 0 deletions
10
.env.example
Normal file
10
.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# WetGIT Environment Variables
|
||||||
|
# Copy to .env and fill in real values:
|
||||||
|
# cp .env.example .env
|
||||||
|
|
||||||
|
# AgentMail API (coornhert@wetgit.nl)
|
||||||
|
# Get your key from https://console.agentmail.to
|
||||||
|
AGENTMAIL_API_KEY=
|
||||||
|
|
||||||
|
# Hetzner Cloud
|
||||||
|
HCLOUD_TOKEN=
|
||||||
33
.gitignore
vendored
Normal file
33
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
# Secrets
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
ansible/.vault_pass
|
||||||
|
|
||||||
|
# Nix / direnv
|
||||||
|
.direnv/
|
||||||
|
result
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.venv/
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
.coverage
|
||||||
|
htmlcov/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
10
ansible/ansible.cfg
Normal file
10
ansible/ansible.cfg
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
[defaults]
|
||||||
|
inventory = inventory/hosts
|
||||||
|
remote_tmp = /tmp/.ansible/tmp
|
||||||
|
host_key_checking = True
|
||||||
|
retry_files_enabled = False
|
||||||
|
roles_path = roles
|
||||||
|
vault_password_file = .vault_pass
|
||||||
|
|
||||||
|
[ssh_connection]
|
||||||
|
pipelining = True
|
||||||
45
ansible/group_vars/wetgit/main.yml
Normal file
45
ansible/group_vars/wetgit/main.yml
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
# WetGIT - Nederlandse wetgeving als code
|
||||||
|
# Deployment variables for dt-prod-01
|
||||||
|
#
|
||||||
|
# IMPORTANT: This server is shared with dt-platform.
|
||||||
|
# Do NOT use ports 8001 (dt-chatbot), 8200 (grimoire).
|
||||||
|
# Do NOT modify /opt/dt-chatbot, /opt/dt-skills-portal, /opt/grimoire.
|
||||||
|
# Do NOT modify the global nginx.conf — only add vhost configs.
|
||||||
|
|
||||||
|
# --- Application ---
|
||||||
|
app_name: wetgit
|
||||||
|
app_dir: /opt/wetgit
|
||||||
|
data_dir: /data/wetgit
|
||||||
|
|
||||||
|
# FastAPI backend
|
||||||
|
backend_port: 8002
|
||||||
|
backend_workers: 1
|
||||||
|
backend_host: "127.0.0.1"
|
||||||
|
|
||||||
|
# --- Domains ---
|
||||||
|
server_name: "api.wetgit.nl"
|
||||||
|
forgejo_domain: "git.wetgit.nl"
|
||||||
|
|
||||||
|
# --- Forgejo ---
|
||||||
|
forgejo_port: 3000
|
||||||
|
forgejo_data_dir: /opt/wetgit/data
|
||||||
|
forgejo_admin_user: coornhert
|
||||||
|
forgejo_admin_email: coornhert@wetgit.nl
|
||||||
|
|
||||||
|
# --- Redis (Docker, shared network with Forgejo) ---
|
||||||
|
redis_port: 6379
|
||||||
|
redis_host: "127.0.0.1"
|
||||||
|
|
||||||
|
# --- Celery ---
|
||||||
|
celery_concurrency: 2
|
||||||
|
|
||||||
|
# --- Codeberg mirror ---
|
||||||
|
codeberg_api_token: "{{ vault_codeberg_api_token | default('') }}"
|
||||||
|
|
||||||
|
# --- AgentMail ---
|
||||||
|
agentmail_api_key: "{{ vault_agentmail_api_key }}"
|
||||||
|
|
||||||
|
# --- Secrets (from vault.yml) ---
|
||||||
|
# vault_agentmail_api_key
|
||||||
|
# vault_codeberg_api_token (add when Codeberg account is ready)
|
||||||
|
# vault_forgejo_admin_password (initial admin password)
|
||||||
14
ansible/group_vars/wetgit/vault.yml
Normal file
14
ansible/group_vars/wetgit/vault.yml
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
$ANSIBLE_VAULT;1.1;AES256
|
||||||
|
35323237613730303463313335643433616238663932643630636530356461323433666435653436
|
||||||
|
3433343462343538333335343165353538613435613962650a656166366364393564353733343561
|
||||||
|
66643462313261643538653839393365643634376432373665653133383464313636633762366163
|
||||||
|
6562336332396535390a333062323534373963356439353336633964383832313431623934653739
|
||||||
|
37646339376338623536323336353931343039323263666265363763373266343533333236346635
|
||||||
|
37656436623764393037393138343536313666613439666535656631313031343061346130376136
|
||||||
|
64383164643466643162393537343265313632343432336238393030306164636434356463396434
|
||||||
|
34656334383731326131393061333138643435366534333965376666393535316334396662633561
|
||||||
|
61386636336438383563326565336635643663313934326333323939663637653531363261613733
|
||||||
|
38646631333739303737616630663337663265616462346637326539306338613866313762306662
|
||||||
|
38633066323936623233336631653836656531633839643739313966623065313931356630613134
|
||||||
|
39636539643065663963626437383637643932633164306337626330623466313737623532366631
|
||||||
|
6435
|
||||||
2
ansible/inventory/hosts
Normal file
2
ansible/inventory/hosts
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
[wetgit]
|
||||||
|
dt-prod-01 ansible_host=100.98.29.89 ansible_user=deploy ansible_become=yes
|
||||||
12
ansible/roles/wetgit-app/handlers/main.yml
Normal file
12
ansible/roles/wetgit-app/handlers/main.yml
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
---
|
||||||
|
- name: restart wetgit
|
||||||
|
systemd:
|
||||||
|
name: wetgit
|
||||||
|
state: restarted
|
||||||
|
daemon_reload: yes
|
||||||
|
|
||||||
|
- name: restart wetgit-celery
|
||||||
|
systemd:
|
||||||
|
name: wetgit-celery
|
||||||
|
state: restarted
|
||||||
|
daemon_reload: yes
|
||||||
79
ansible/roles/wetgit-app/tasks/main.yml
Normal file
79
ansible/roles/wetgit-app/tasks/main.yml
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
---
|
||||||
|
# WetGIT FastAPI application + Celery worker
|
||||||
|
# Deploys to /opt/wetgit/backend with own venv and systemd services
|
||||||
|
#
|
||||||
|
# Directories are created by wetgit-forgejo role (runs first).
|
||||||
|
# This role only manages the FastAPI app and Celery worker.
|
||||||
|
#
|
||||||
|
# NOTE: Services are only enabled when application code exists.
|
||||||
|
# On first deploy (no code yet), the venv, dependency install and service
# start are skipped; the .env file and systemd unit files are still deployed.
|
||||||
|
|
||||||
|
- name: Check if application code exists
|
||||||
|
stat:
|
||||||
|
path: "{{ app_dir }}/backend/requirements.txt"
|
||||||
|
register: app_code
|
||||||
|
|
||||||
|
- name: Create Python venv
|
||||||
|
command: python3 -m venv {{ app_dir }}/backend/venv
|
||||||
|
args:
|
||||||
|
creates: "{{ app_dir }}/backend/venv/bin/python"
|
||||||
|
when: app_code.stat.exists
|
||||||
|
|
||||||
|
- name: Set venv ownership
|
||||||
|
file:
|
||||||
|
path: "{{ app_dir }}/backend/venv"
|
||||||
|
owner: www-data
|
||||||
|
group: www-data
|
||||||
|
recurse: yes
|
||||||
|
when: app_code.stat.exists
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
pip:
|
||||||
|
requirements: "{{ app_dir }}/backend/requirements.txt"
|
||||||
|
virtualenv: "{{ app_dir }}/backend/venv"
|
||||||
|
when: app_code.stat.exists
|
||||||
|
notify: restart wetgit
|
||||||
|
|
||||||
|
- name: Deploy environment file
|
||||||
|
template:
|
||||||
|
src: wetgit.env.j2
|
||||||
|
dest: "{{ app_dir }}/backend/.env"
|
||||||
|
owner: www-data
|
||||||
|
group: www-data
|
||||||
|
mode: "0600"
|
||||||
|
notify: restart wetgit
|
||||||
|
|
||||||
|
- name: Deploy WetGIT systemd service
|
||||||
|
template:
|
||||||
|
src: wetgit.service.j2
|
||||||
|
dest: /etc/systemd/system/wetgit.service
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: restart wetgit
|
||||||
|
|
||||||
|
- name: Deploy Celery worker systemd service
|
||||||
|
template:
|
||||||
|
src: wetgit-celery.service.j2
|
||||||
|
dest: /etc/systemd/system/wetgit-celery.service
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: restart wetgit-celery
|
||||||
|
|
||||||
|
# Only start services when app code is deployed
|
||||||
|
- name: Enable and start WetGIT service
|
||||||
|
systemd:
|
||||||
|
name: wetgit
|
||||||
|
enabled: yes
|
||||||
|
state: started
|
||||||
|
daemon_reload: yes
|
||||||
|
when: app_code.stat.exists
|
||||||
|
|
||||||
|
- name: Enable and start Celery worker
|
||||||
|
systemd:
|
||||||
|
name: wetgit-celery
|
||||||
|
enabled: yes
|
||||||
|
state: started
|
||||||
|
daemon_reload: yes
|
||||||
|
when: app_code.stat.exists
|
||||||
17
ansible/roles/wetgit-app/templates/wetgit-celery.service.j2
Normal file
17
ansible/roles/wetgit-app/templates/wetgit-celery.service.j2
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
[Unit]
|
||||||
|
Description=WetGIT Celery Worker
|
||||||
|
After=network.target docker.service
|
||||||
|
Requires=docker.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=www-data
|
||||||
|
Group=www-data
|
||||||
|
WorkingDirectory={{ app_dir }}/backend
|
||||||
|
EnvironmentFile={{ app_dir }}/backend/.env
|
||||||
|
ExecStart={{ app_dir }}/backend/venv/bin/celery -A tasks worker --loglevel=info --concurrency={{ celery_concurrency }}
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
19
ansible/roles/wetgit-app/templates/wetgit.env.j2
Normal file
19
ansible/roles/wetgit-app/templates/wetgit.env.j2
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
# WetGIT environment — managed by Ansible
|
||||||
|
# Do NOT edit manually on the server
|
||||||
|
|
||||||
|
# FastAPI
|
||||||
|
WETGIT_HOST={{ backend_host }}
|
||||||
|
WETGIT_PORT={{ backend_port }}
|
||||||
|
WETGIT_WORKERS={{ backend_workers }}
|
||||||
|
|
||||||
|
# Redis / Celery
|
||||||
|
REDIS_URL=redis://{{ redis_host }}:{{ redis_port }}/0
|
||||||
|
CELERY_BROKER_URL=redis://{{ redis_host }}:{{ redis_port }}/0
|
||||||
|
CELERY_RESULT_BACKEND=redis://{{ redis_host }}:{{ redis_port }}/1
|
||||||
|
|
||||||
|
# AgentMail
|
||||||
|
AGENTMAIL_API_KEY={{ agentmail_api_key }}
|
||||||
|
|
||||||
|
# Data
|
||||||
|
WETGIT_DATA_DIR={{ data_dir }}
|
||||||
|
WETGIT_GIT_REPOS_DIR={{ data_dir }}/git-repos
|
||||||
17
ansible/roles/wetgit-app/templates/wetgit.service.j2
Normal file
17
ansible/roles/wetgit-app/templates/wetgit.service.j2
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
[Unit]
|
||||||
|
Description=WetGIT API - Nederlandse wetgeving als code
|
||||||
|
After=network.target docker.service
|
||||||
|
Wants=wetgit-celery.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=www-data
|
||||||
|
Group=www-data
|
||||||
|
WorkingDirectory={{ app_dir }}/backend
|
||||||
|
EnvironmentFile={{ app_dir }}/backend/.env
|
||||||
|
ExecStart={{ app_dir }}/backend/venv/bin/uvicorn main:app --host {{ backend_host }} --port {{ backend_port }} --workers {{ backend_workers }}
|
||||||
|
Restart=always
|
||||||
|
RestartSec=5
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
5
ansible/roles/wetgit-forgejo/handlers/main.yml
Normal file
5
ansible/roles/wetgit-forgejo/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
- name: restart forgejo
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ app_dir }}/docker"
|
||||||
|
state: restarted
|
||||||
148
ansible/roles/wetgit-forgejo/tasks/main.yml
Normal file
148
ansible/roles/wetgit-forgejo/tasks/main.yml
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
---
|
||||||
|
# WetGIT Forgejo (self-hosted Git) + Redis
|
||||||
|
#
|
||||||
|
# Deploys Forgejo and Redis as Docker containers.
|
||||||
|
# Forgejo serves git.wetgit.nl (HTTPS-only, no SSH — blocked by firewall).
|
||||||
|
# Redis provides Celery broker for the WetGIT pipeline.
|
||||||
|
#
|
||||||
|
# IMPORTANT: Does NOT touch dt-platform's Docker services (grimoire).
|
||||||
|
# All containers use the 'wetgit-network' Docker network.
|
||||||
|
|
||||||
|
# --- System user ---
|
||||||
|
|
||||||
|
- name: Create wetgit system user
|
||||||
|
user:
|
||||||
|
name: wetgit
|
||||||
|
system: yes
|
||||||
|
home: /opt/wetgit
|
||||||
|
shell: /bin/bash
|
||||||
|
create_home: no
|
||||||
|
|
||||||
|
- name: Get wetgit user UID
|
||||||
|
command: id -u wetgit
|
||||||
|
register: wetgit_uid_result
|
||||||
|
changed_when: false
|
||||||
|
check_mode: false
|
||||||
|
|
||||||
|
- name: Get wetgit user GID
|
||||||
|
command: id -g wetgit
|
||||||
|
register: wetgit_gid_result
|
||||||
|
changed_when: false
|
||||||
|
check_mode: false
|
||||||
|
|
||||||
|
- name: Store wetgit UID/GID as facts
|
||||||
|
set_fact:
|
||||||
|
wetgit_uid: "{{ wetgit_uid_result.stdout }}"
|
||||||
|
wetgit_gid: "{{ wetgit_gid_result.stdout }}"
|
||||||
|
|
||||||
|
# --- Directories ---
|
||||||
|
|
||||||
|
- name: Create WetGIT directories
|
||||||
|
file:
|
||||||
|
path: "{{ item.path }}"
|
||||||
|
state: directory
|
||||||
|
owner: "{{ item.owner }}"
|
||||||
|
group: "{{ item.group }}"
|
||||||
|
mode: "0755"
|
||||||
|
loop:
|
||||||
|
# Forgejo directories (owned by wetgit user)
|
||||||
|
- { path: "{{ app_dir }}/docker", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ forgejo_data_dir }}", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ forgejo_data_dir }}/gitea/conf", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ data_dir }}/redis", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ app_dir }}/scripts", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ app_dir }}/backups", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ app_dir }}/logs", owner: wetgit, group: wetgit }
|
||||||
|
- { path: "{{ app_dir }}/mirrors", owner: wetgit, group: wetgit }
|
||||||
|
# Application directories (top-level dirs owned by root; backend and git-repos owned by www-data for FastAPI/Celery)
|
||||||
|
- { path: "{{ app_dir }}", owner: root, group: root }
|
||||||
|
- { path: "{{ app_dir }}/backend", owner: www-data, group: www-data }
|
||||||
|
- { path: "{{ data_dir }}", owner: root, group: root }
|
||||||
|
- { path: "{{ data_dir }}/git-repos", owner: www-data, group: www-data }
|
||||||
|
|
||||||
|
# --- Forgejo config ---
|
||||||
|
|
||||||
|
- name: Deploy Forgejo app.ini (initial seed)
|
||||||
|
template:
|
||||||
|
src: app.ini.j2
|
||||||
|
dest: "{{ forgejo_data_dir }}/gitea/conf/app.ini"
|
||||||
|
owner: wetgit
|
||||||
|
group: wetgit
|
||||||
|
mode: "0644"
|
||||||
|
# Don't overwrite if Forgejo has already modified it
|
||||||
|
force: no
|
||||||
|
notify: restart forgejo
|
||||||
|
|
||||||
|
# --- Docker Compose ---
|
||||||
|
|
||||||
|
- name: Deploy Docker Compose stack
|
||||||
|
template:
|
||||||
|
src: docker-compose.yml.j2
|
||||||
|
dest: "{{ app_dir }}/docker/docker-compose.yml"
|
||||||
|
owner: wetgit
|
||||||
|
group: wetgit
|
||||||
|
mode: "0644"
|
||||||
|
notify: restart forgejo
|
||||||
|
|
||||||
|
- name: Start WetGIT Docker stack
|
||||||
|
community.docker.docker_compose_v2:
|
||||||
|
project_src: "{{ app_dir }}/docker"
|
||||||
|
state: present
|
||||||
|
|
||||||
|
# --- Backup script ---
|
||||||
|
|
||||||
|
- name: Deploy backup script
|
||||||
|
template:
|
||||||
|
src: backup.sh.j2
|
||||||
|
dest: "{{ app_dir }}/scripts/backup.sh"
|
||||||
|
owner: wetgit
|
||||||
|
group: wetgit
|
||||||
|
mode: "0755"
|
||||||
|
|
||||||
|
# --- Mirror script ---
|
||||||
|
|
||||||
|
- name: Deploy Codeberg mirror script
|
||||||
|
template:
|
||||||
|
src: mirror-to-codeberg.sh.j2
|
||||||
|
dest: "{{ app_dir }}/scripts/mirror-to-codeberg.sh"
|
||||||
|
owner: wetgit
|
||||||
|
group: wetgit
|
||||||
|
mode: "0755"
|
||||||
|
|
||||||
|
- name: Deploy Codeberg token
|
||||||
|
copy:
|
||||||
|
content: "{{ codeberg_api_token }}"
|
||||||
|
dest: "{{ app_dir }}/.codeberg-token"
|
||||||
|
owner: wetgit
|
||||||
|
group: wetgit
|
||||||
|
mode: "0600"
|
||||||
|
when: codeberg_api_token is defined and codeberg_api_token | length > 0
|
||||||
|
|
||||||
|
# --- Cron jobs ---
|
||||||
|
|
||||||
|
- name: Configure backup cron (weekly Sunday 02:00)
|
||||||
|
cron:
|
||||||
|
name: "wetgit-backup"
|
||||||
|
user: root
|
||||||
|
weekday: "0"
|
||||||
|
hour: "2"
|
||||||
|
minute: "0"
|
||||||
|
job: "{{ app_dir }}/scripts/backup.sh >> {{ app_dir }}/logs/backup.log 2>&1"
|
||||||
|
|
||||||
|
- name: Configure Codeberg mirror cron (daily 04:00)
|
||||||
|
cron:
|
||||||
|
name: "wetgit-codeberg-mirror"
|
||||||
|
user: wetgit
|
||||||
|
hour: "4"
|
||||||
|
minute: "0"
|
||||||
|
job: "{{ app_dir }}/scripts/mirror-to-codeberg.sh >> {{ app_dir }}/logs/mirror.log 2>&1"
|
||||||
|
when: codeberg_api_token is defined and codeberg_api_token | length > 0
|
||||||
|
|
||||||
|
- name: Configure log cleanup cron (monthly)
|
||||||
|
cron:
|
||||||
|
name: "wetgit-log-cleanup"
|
||||||
|
user: wetgit
|
||||||
|
day: "1"
|
||||||
|
hour: "5"
|
||||||
|
minute: "0"
|
||||||
|
job: "find {{ app_dir }}/logs -name '*.log' -mtime +30 -delete"
|
||||||
75
ansible/roles/wetgit-forgejo/templates/app.ini.j2
Normal file
75
ansible/roles/wetgit-forgejo/templates/app.ini.j2
Normal file
|
|
@ -0,0 +1,75 @@
|
||||||
|
; WetGit Forgejo configuration — managed by Ansible
|
||||||
|
; This file is merged with Forgejo's defaults on first boot.
|
||||||
|
; After first boot, Forgejo writes its own app.ini in /data/gitea/conf/.
|
||||||
|
; This template is used to seed initial configuration.
|
||||||
|
|
||||||
|
[DEFAULT]
|
||||||
|
APP_NAME = WetGit
|
||||||
|
|
||||||
|
[server]
|
||||||
|
DOMAIN = {{ forgejo_domain }}
|
||||||
|
SSH_DOMAIN = {{ forgejo_domain }}
|
||||||
|
ROOT_URL = https://{{ forgejo_domain }}/
|
||||||
|
HTTP_PORT = 3000
|
||||||
|
; HTTPS-only — no SSH, firewall blocks port 2222
|
||||||
|
DISABLE_SSH = true
|
||||||
|
LFS_START_SERVER = true
|
||||||
|
OFFLINE_MODE = false
|
||||||
|
|
||||||
|
[database]
|
||||||
|
DB_TYPE = sqlite3
|
||||||
|
PATH = /data/gitea/forgejo.db
|
||||||
|
|
||||||
|
[service]
|
||||||
|
DISABLE_REGISTRATION = true
|
||||||
|
REQUIRE_SIGNIN_VIEW = false
|
||||||
|
DEFAULT_KEEP_EMAIL_PRIVATE = true
|
||||||
|
|
||||||
|
[repository]
|
||||||
|
DEFAULT_BRANCH = main
|
||||||
|
PREFERRED_LICENSES = MIT License,CC0-1.0
|
||||||
|
MAX_CREATION_LIMIT = -1
|
||||||
|
ENABLE_PUSH_CREATE_USER = true
|
||||||
|
ENABLE_PUSH_CREATE_ORG = true
|
||||||
|
; 100 MB max file size for large law datasets
|
||||||
|
MAX_FILE_SIZE = 104857600
|
||||||
|
|
||||||
|
[git]
|
||||||
|
MAX_GIT_DIFF_LINES = 10000
|
||||||
|
MAX_GIT_DIFF_FILES = 1000
|
||||||
|
|
||||||
|
[git.timeout]
|
||||||
|
DEFAULT = 600
|
||||||
|
MIGRATE = 1200
|
||||||
|
MIRROR = 600
|
||||||
|
CLONE = 600
|
||||||
|
PULL = 600
|
||||||
|
GC = 120
|
||||||
|
|
||||||
|
[lfs]
|
||||||
|
PATH = /data/git/lfs
|
||||||
|
|
||||||
|
[ui]
|
||||||
|
DEFAULT_THEME = forgejo-auto
|
||||||
|
SHOW_USER_EMAIL = false
|
||||||
|
|
||||||
|
[actions]
|
||||||
|
ENABLED = true
|
||||||
|
|
||||||
|
[indexer]
|
||||||
|
REPO_INDEXER_ENABLED = true
|
||||||
|
REPO_INDEXER_PATH = /data/gitea/indexers/repos.bleve
|
||||||
|
REPO_INDEXER_EXCLUDE = node_modules/**
|
||||||
|
|
||||||
|
[markup.markdown]
|
||||||
|
ENABLED = true
|
||||||
|
FILE_EXTENSIONS = .md,.markdown
|
||||||
|
|
||||||
|
[mailer]
|
||||||
|
ENABLED = true
|
||||||
|
PROTOCOL = smtp+starttls
|
||||||
|
SMTP_ADDR = {{ forgejo_smtp_host | default('smtp.email.undefined') }}
|
||||||
|
SMTP_PORT = {{ forgejo_smtp_port | default(587) }}
|
||||||
|
FROM = Coornhert <coornhert@wetgit.nl>
|
||||||
|
USER = {{ forgejo_smtp_user | default('') }}
|
||||||
|
PASSWD = {{ forgejo_smtp_password | default('') }}
|
||||||
37
ansible/roles/wetgit-forgejo/templates/backup.sh.j2
Normal file
37
ansible/roles/wetgit-forgejo/templates/backup.sh.j2
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# WetGIT Forgejo backup — managed by Ansible
# Uses Forgejo's built-in dump command (no downtime).
#
# Artefacts written to BACKUP_DIR:
#   wetgit-forgejo-<timestamp>.tar.gz  Forgejo dump (failure is fatal)
#   wetgit-redis-<timestamp>.rdb       Redis RDB snapshot (best-effort)
# Artefacts older than RETENTION_DAYS days are deleted at the end.

BACKUP_DIR="{{ app_dir }}/backups"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RETENTION_DAYS=14
# Computed once, so every log line of a run carries the run's start time.
LOG_PREFIX="[$(date '+%Y-%m-%d %H:%M:%S')]"

echo "$LOG_PREFIX Starting WetGit backup..."

# Forgejo dump (runs inside container, no service stop needed).
# Run as the container's 'git' user: 'docker exec' defaults to root in this
# image, and Forgejo refuses to run as root.
docker exec -u git wetgit-forgejo forgejo dump \
    --type tar.gz \
    --file "/data/backup-${TIMESTAMP}.tar.gz" \
    2>&1 || {
    echo "$LOG_PREFIX ERROR: Forgejo dump failed"
    exit 1
}

# Move dump from container volume to backup dir
# (the container's /data is the host directory {{ forgejo_data_dir }}).
mv "{{ forgejo_data_dir }}/backup-${TIMESTAMP}.tar.gz" \
    "$BACKUP_DIR/wetgit-forgejo-${TIMESTAMP}.tar.gz"

# Also back up the Redis RDB snapshot (best-effort, never fails the backup).
# BGSAVE writes dump.rdb in the background; the AOF file is not copied.
if ! docker exec wetgit-redis redis-cli BGSAVE 2>/dev/null; then
    echo "$LOG_PREFIX WARNING: Redis BGSAVE failed (non-fatal)"
fi
# Give the background save a moment to finish before copying the snapshot.
sleep 2
if ! cp "{{ data_dir }}/redis/dump.rdb" \
    "$BACKUP_DIR/wetgit-redis-${TIMESTAMP}.rdb" 2>/dev/null; then
    echo "$LOG_PREFIX WARNING: Redis snapshot copy failed (non-fatal)"
fi

# Clean old backups
find "$BACKUP_DIR" -name "wetgit-forgejo-*.tar.gz" -mtime +${RETENTION_DAYS} -delete
find "$BACKUP_DIR" -name "wetgit-redis-*.rdb" -mtime +${RETENTION_DAYS} -delete

echo "$LOG_PREFIX Backup complete: wetgit-forgejo-${TIMESTAMP}.tar.gz"
|
||||||
45
ansible/roles/wetgit-forgejo/templates/docker-compose.yml.j2
Normal file
45
ansible/roles/wetgit-forgejo/templates/docker-compose.yml.j2
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
services:
|
||||||
|
forgejo:
|
||||||
|
image: codeberg.org/forgejo/forgejo:10
|
||||||
|
container_name: wetgit-forgejo
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
- USER_UID={{ wetgit_uid }}
|
||||||
|
- USER_GID={{ wetgit_gid }}
|
||||||
|
volumes:
|
||||||
|
- {{ forgejo_data_dir }}:/data
|
||||||
|
- /etc/timezone:/etc/timezone:ro
|
||||||
|
- /etc/localtime:/etc/localtime:ro
|
||||||
|
ports:
|
||||||
|
- "{{ backend_host }}:{{ forgejo_port }}:3000"
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 1G
|
||||||
|
cpus: "2.0"
|
||||||
|
reservations:
|
||||||
|
memory: 256M
|
||||||
|
cpus: "0.5"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:3000/api/v1/version"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
networks:
|
||||||
|
- wetgit
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
container_name: wetgit-redis
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "{{ backend_host }}:{{ redis_port }}:6379"
|
||||||
|
volumes:
|
||||||
|
- {{ data_dir }}/redis:/data
|
||||||
|
command: redis-server --appendonly yes --maxmemory 256mb --maxmemory-policy allkeys-lru
|
||||||
|
networks:
|
||||||
|
- wetgit
|
||||||
|
|
||||||
|
networks:
|
||||||
|
wetgit:
|
||||||
|
name: wetgit-network
|
||||||
|
|
@ -0,0 +1,55 @@
|
||||||
|
#!/usr/bin/env bash
set -euo pipefail

# Mirror WetGit repos from self-hosted Forgejo to Codeberg
# Managed by Ansible — runs daily at 04:00
#
# For each entry in REPOS a bare clone is kept under MIRROR_DIR, refreshed
# from Forgejo, and pushed to Codeberg with --mirror. A failed push is logged
# and skipped; a failed clone or fetch aborts the script (set -e).

CODEBERG_USER="coornhert"
CODEBERG_TOKEN_FILE="{{ app_dir }}/.codeberg-token"
MIRROR_DIR="{{ app_dir }}/mirrors"
# Computed once, so every log line of a run carries the run's start time.
LOG_PREFIX="[$(date '+%Y-%m-%d %H:%M:%S')]"

REPOS=(
    "wetgit/meta"
    "wetgit/rijk"
    # Add more as they are created:
    # "wetgit/cvdr-noord-holland"
    # "wetgit/eu"
)

if [ ! -f "$CODEBERG_TOKEN_FILE" ]; then
    echo "$LOG_PREFIX ERROR: Codeberg token not found at $CODEBERG_TOKEN_FILE"
    exit 1
fi

CODEBERG_TOKEN=$(cat "$CODEBERG_TOKEN_FILE")
mkdir -p "$MIRROR_DIR"

for REPO in "${REPOS[@]}"; do
    REPO_NAME=$(basename "$REPO")
    REPO_MIRROR_DIR="$MIRROR_DIR/$REPO_NAME.git"
    FORGEJO_URL="https://{{ forgejo_domain }}/${REPO}.git"
    # NOTE(review): the token is embedded in the remote URL and therefore
    # stored in the mirror's git config — keep MIRROR_DIR access restricted.
    CODEBERG_URL="https://${CODEBERG_USER}:${CODEBERG_TOKEN}@codeberg.org/${REPO}.git"

    echo "$LOG_PREFIX Mirroring $REPO..."

    if [ ! -d "$REPO_MIRROR_DIR" ]; then
        echo "$LOG_PREFIX Initial clone from Forgejo..."
        git clone --bare "$FORGEJO_URL" "$REPO_MIRROR_DIR"
    else
        echo "$LOG_PREFIX Fetching from Forgejo..."
        git -C "$REPO_MIRROR_DIR" fetch origin --prune \
            '+refs/heads/*:refs/heads/*' '+refs/tags/*:refs/tags/*'
    fi

    # Add or refresh the 'codeberg' remote on every run, so a rotated token
    # takes effect and a mirror left without the remote (interrupted first
    # run) repairs itself.
    if git -C "$REPO_MIRROR_DIR" remote get-url codeberg >/dev/null 2>&1; then
        git -C "$REPO_MIRROR_DIR" remote set-url codeberg "$CODEBERG_URL"
    else
        git -C "$REPO_MIRROR_DIR" remote add codeberg "$CODEBERG_URL"
    fi

    echo "$LOG_PREFIX Pushing to Codeberg..."
    git -C "$REPO_MIRROR_DIR" push codeberg --mirror --force 2>&1 || {
        echo "$LOG_PREFIX WARNING: Push to Codeberg failed for $REPO (non-fatal)"
    }

    echo "$LOG_PREFIX Done: $REPO"
done

echo "$LOG_PREFIX Mirror complete."
|
||||||
5
ansible/roles/wetgit-nginx/handlers/main.yml
Normal file
5
ansible/roles/wetgit-nginx/handlers/main.yml
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
---
|
||||||
|
- name: reload nginx
|
||||||
|
systemd:
|
||||||
|
name: nginx
|
||||||
|
state: reloaded
|
||||||
118
ansible/roles/wetgit-nginx/tasks/main.yml
Normal file
118
ansible/roles/wetgit-nginx/tasks/main.yml
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
---
|
||||||
|
# Nginx vhosts for WetGIT
|
||||||
|
# IMPORTANT: Only adds vhost configs. Does NOT touch global nginx.conf
|
||||||
|
# (managed by dt-platform's nginx role).
|
||||||
|
#
|
||||||
|
# Strategy: Deploy HTTP-only first → get SSL certs → deploy full HTTPS config.
|
||||||
|
|
||||||
|
# --- Step 1: Check existing SSL certificates ---
|
||||||
|
|
||||||
|
- name: Check if API SSL certificate exists
|
||||||
|
stat:
|
||||||
|
path: "/etc/letsencrypt/live/{{ server_name }}/fullchain.pem"
|
||||||
|
register: ssl_cert_api
|
||||||
|
|
||||||
|
- name: Check if Forgejo SSL certificate exists
|
||||||
|
stat:
|
||||||
|
path: "/etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem"
|
||||||
|
register: ssl_cert_git
|
||||||
|
|
||||||
|
# --- Step 2: Deploy HTTP-only configs for domains without certs ---
|
||||||
|
|
||||||
|
- name: Deploy API HTTP-only vhost (pre-SSL)
|
||||||
|
copy:
|
||||||
|
content: |
|
||||||
|
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
listen [::]:80;
|
||||||
|
server_name {{ server_name }};
|
||||||
|
location /.well-known/acme-challenge/ { root /var/www/certbot; }
|
||||||
|
location / { return 503; }
|
||||||
|
}
|
||||||
|
dest: /etc/nginx/sites-available/wetgit-api.conf
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
when: not ssl_cert_api.stat.exists
|
||||||
|
notify: reload nginx
|
||||||
|
|
||||||
|
- name: Deploy Forgejo HTTP-only vhost (pre-SSL)
|
||||||
|
copy:
|
||||||
|
content: |
|
||||||
|
# Temporary HTTP-only config for SSL provisioning — managed by Ansible
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
listen [::]:80;
|
||||||
|
server_name {{ forgejo_domain }};
|
||||||
|
location /.well-known/acme-challenge/ { root /var/www/certbot; }
|
||||||
|
location / { return 503; }
|
||||||
|
}
|
||||||
|
dest: /etc/nginx/sites-available/wetgit-git.conf
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
when: not ssl_cert_git.stat.exists
|
||||||
|
notify: reload nginx
|
||||||
|
|
||||||
|
# --- Step 3: Enable vhosts and reload nginx ---
|
||||||
|
|
||||||
|
- name: Enable API vhost
|
||||||
|
file:
|
||||||
|
src: /etc/nginx/sites-available/wetgit-api.conf
|
||||||
|
dest: /etc/nginx/sites-enabled/wetgit-api.conf
|
||||||
|
state: link
|
||||||
|
notify: reload nginx
|
||||||
|
|
||||||
|
- name: Enable Forgejo vhost
|
||||||
|
file:
|
||||||
|
src: /etc/nginx/sites-available/wetgit-git.conf
|
||||||
|
dest: /etc/nginx/sites-enabled/wetgit-git.conf
|
||||||
|
state: link
|
||||||
|
notify: reload nginx
|
||||||
|
|
||||||
|
# Force handler to run now so nginx has the HTTP configs before certbot
|
||||||
|
- name: Flush handlers (reload nginx for certbot)
|
||||||
|
meta: flush_handlers
|
||||||
|
|
||||||
|
# --- Step 4: Obtain SSL certificates via webroot ---
|
||||||
|
|
||||||
|
- name: Obtain SSL certificate for {{ server_name }}
|
||||||
|
command: >
|
||||||
|
certbot certonly --webroot
|
||||||
|
-w /var/www/certbot
|
||||||
|
-d {{ server_name }}
|
||||||
|
--non-interactive --agree-tos
|
||||||
|
--email coornhert@wetgit.nl
|
||||||
|
when: not ssl_cert_api.stat.exists
|
||||||
|
register: certbot_api
|
||||||
|
|
||||||
|
- name: Obtain SSL certificate for {{ forgejo_domain }}
|
||||||
|
command: >
|
||||||
|
certbot certonly --webroot
|
||||||
|
-w /var/www/certbot
|
||||||
|
-d {{ forgejo_domain }}
|
||||||
|
--non-interactive --agree-tos
|
||||||
|
--email coornhert@wetgit.nl
|
||||||
|
when: not ssl_cert_git.stat.exists
|
||||||
|
register: certbot_git
|
||||||
|
|
||||||
|
# --- Step 5: Deploy full HTTPS configs ---
|
||||||
|
|
||||||
|
- name: Deploy API nginx vhost (full HTTPS)
|
||||||
|
template:
|
||||||
|
src: wetgit-api.conf.j2
|
||||||
|
dest: /etc/nginx/sites-available/wetgit-api.conf
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: reload nginx
|
||||||
|
|
||||||
|
- name: Deploy Forgejo nginx vhost (full HTTPS)
|
||||||
|
template:
|
||||||
|
src: wetgit-git.conf.j2
|
||||||
|
dest: /etc/nginx/sites-available/wetgit-git.conf
|
||||||
|
owner: root
|
||||||
|
group: root
|
||||||
|
mode: "0644"
|
||||||
|
notify: reload nginx
|
||||||
51
ansible/roles/wetgit-nginx/templates/wetgit-api.conf.j2
Normal file
51
ansible/roles/wetgit-nginx/templates/wetgit-api.conf.j2
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
# WetGIT API — managed by WetGIT Ansible (not dt-platform)
|
||||||
|
# Do NOT edit manually
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
listen [::]:80;
|
||||||
|
server_name {{ server_name }};
|
||||||
|
|
||||||
|
# ACME challenge (reuse existing certbot webroot)
|
||||||
|
location /.well-known/acme-challenge/ {
|
||||||
|
root /var/www/certbot;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
return 301 https://$host$request_uri;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2;
|
||||||
|
listen [::]:443 ssl http2;
|
||||||
|
server_name {{ server_name }};
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/{{ server_name }}/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/{{ server_name }}/privkey.pem;
|
||||||
|
|
||||||
|
# Security headers
|
||||||
|
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
add_header X-Frame-Options "DENY" always;
|
||||||
|
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||||
|
|
||||||
|
# API proxy
|
||||||
|
location / {
|
||||||
|
proxy_pass http://{{ backend_host }}:{{ backend_port }};
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# Timeouts for long-running legislation processing
|
||||||
|
proxy_read_timeout 120s;
|
||||||
|
proxy_connect_timeout 10s;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Health check (access logging disabled)
|
||||||
|
location = /health {
|
||||||
|
proxy_pass http://{{ backend_host }}:{{ backend_port }}/health;
|
||||||
|
access_log off;
|
||||||
|
}
|
||||||
|
}
|
||||||
52
ansible/roles/wetgit-nginx/templates/wetgit-git.conf.j2
Normal file
52
ansible/roles/wetgit-nginx/templates/wetgit-git.conf.j2
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
# Forgejo (git.wetgit.nl) — managed by WetGIT Ansible (not dt-platform)
|
||||||
|
# Do NOT edit manually
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
listen [::]:80;
|
||||||
|
server_name {{ forgejo_domain }};
|
||||||
|
|
||||||
|
location /.well-known/acme-challenge/ {
|
||||||
|
root /var/www/certbot;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
return 301 https://$host$request_uri;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2;
|
||||||
|
listen [::]:443 ssl http2;
|
||||||
|
server_name {{ forgejo_domain }};
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/{{ forgejo_domain }}/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/{{ forgejo_domain }}/privkey.pem;
|
||||||
|
|
||||||
|
# Security headers
|
||||||
|
add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
|
||||||
|
add_header X-Content-Type-Options "nosniff" always;
|
||||||
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
|
add_header Referrer-Policy "strict-origin-when-cross-origin" always;
|
||||||
|
|
||||||
|
# Large body size for git pushes (law datasets can be large)
|
||||||
|
client_max_body_size 512M;
|
||||||
|
|
||||||
|
# Timeouts for large git operations
|
||||||
|
proxy_connect_timeout 300;
|
||||||
|
proxy_send_timeout 300;
|
||||||
|
proxy_read_timeout 300;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://{{ backend_host }}:{{ forgejo_port }};
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# WebSocket support (Forgejo live features)
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection "upgrade";
|
||||||
|
}
|
||||||
|
}
|
||||||
27
ansible/site.yml
Normal file
27
ansible/site.yml
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
---
|
||||||
|
# WetGIT - Nederlandse wetgeving als code
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ansible-playbook ansible/site.yml
|
||||||
|
# ansible-playbook ansible/site.yml --tags forgejo
|
||||||
|
# ansible-playbook ansible/site.yml --tags app
|
||||||
|
# ansible-playbook ansible/site.yml --tags nginx
|
||||||
|
# ansible-playbook ansible/site.yml --check (dry-run)
|
||||||
|
#
|
||||||
|
# NOTE: This server is shared with dt-platform.
|
||||||
|
# This playbook only manages WetGIT resources.
|
||||||
|
# System-level config (users, packages, firewall) is managed by dt-platform.
|
||||||
|
|
||||||
|
- name: Deploy WetGIT
|
||||||
|
hosts: wetgit
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: wetgit-forgejo
|
||||||
|
tags: [forgejo, docker]
|
||||||
|
|
||||||
|
- role: wetgit-app
|
||||||
|
tags: [app]
|
||||||
|
|
||||||
|
- role: wetgit-nginx
|
||||||
|
tags: [nginx]
|
||||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-utils": {
|
||||||
|
"inputs": {
|
||||||
|
"systems": "systems"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1731533236,
|
||||||
|
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "flake-utils",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1774610258,
|
||||||
|
"narHash": "sha256-HaThtroVD9wRdx7KQk0B75JmFcXlMUoEdDFNOMOlsOs=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "832efc09b4caf6b4569fbf9dc01bec3082a00611",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixpkgs-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-utils": "flake-utils",
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
116
flake.nix
Normal file
116
flake.nix
Normal file
|
|
@ -0,0 +1,116 @@
|
||||||
|
{
|
||||||
|
description = "WetGit - Nederlandse wetgeving als code";
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
|
||||||
|
flake-utils.url = "github:numtide/flake-utils";
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs = { self, nixpkgs, flake-utils }:
|
||||||
|
flake-utils.lib.eachDefaultSystem (system:
|
||||||
|
let
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
|
||||||
|
# Python 3.13 (zelfde versie als ansible gebruikt, voorkomt PATH-conflicten)
|
||||||
|
pythonEnv = pkgs.python313.withPackages (ps: with ps; [
|
||||||
|
# Conversie-pipeline (PRD: Technische Stack)
|
||||||
|
lxml # BWB XML-parsing met XPath/XSLT
|
||||||
|
pygit2 # Git-operaties via libgit2 (performanter dan GitPython)
|
||||||
|
pyyaml # YAML frontmatter generatie
|
||||||
|
python-frontmatter # Markdown + YAML frontmatter parsing
|
||||||
|
|
||||||
|
# API-laag (PRD: FastAPI)
|
||||||
|
fastapi
|
||||||
|
uvicorn # ASGI server
|
||||||
|
httpx # Async HTTP client (SRU-API, EUR-Lex)
|
||||||
|
pydantic # Data validatie
|
||||||
|
|
||||||
|
# Achtergrondtaken (PRD: Celery + Redis)
|
||||||
|
celery
|
||||||
|
redis # Python Redis client
|
||||||
|
|
||||||
|
# CLI-tool (PRD: wetgit CLI)
|
||||||
|
click
|
||||||
|
rich # Terminal formatting
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
pytest
|
||||||
|
pytest-cov
|
||||||
|
pytest-asyncio
|
||||||
|
|
||||||
|
# Development tools
|
||||||
|
black
|
||||||
|
ruff
|
||||||
|
mypy
|
||||||
|
pip
|
||||||
|
setuptools
|
||||||
|
wheel
|
||||||
|
build
|
||||||
|
|
||||||
|
# Typing stubs
|
||||||
|
types-requests
|
||||||
|
types-pyyaml
|
||||||
|
]);
|
||||||
|
|
||||||
|
in {
|
||||||
|
devShells.default = pkgs.mkShell {
|
||||||
|
name = "wetgit";
|
||||||
|
|
||||||
|
buildInputs = with pkgs; [
|
||||||
|
# Python environment
|
||||||
|
pythonEnv
|
||||||
|
|
||||||
|
# Dependency management
|
||||||
|
uv
|
||||||
|
|
||||||
|
# Ansible (infrastructuur provisioning Hetzner)
|
||||||
|
ansible
|
||||||
|
ansible-lint
|
||||||
|
|
||||||
|
# Hetzner Cloud CLI
|
||||||
|
hcloud
|
||||||
|
|
||||||
|
# Redis server (lokale development)
|
||||||
|
redis
|
||||||
|
|
||||||
|
# Git & tools
|
||||||
|
git
|
||||||
|
jq
|
||||||
|
yq-go
|
||||||
|
curl
|
||||||
|
|
||||||
|
# Native dependencies voor pygit2
|
||||||
|
libgit2
|
||||||
|
];
|
||||||
|
|
||||||
|
shellHook = ''
|
||||||
|
echo "WetGit - Nederlandse wetgeving als code"
|
||||||
|
echo ""
|
||||||
|
echo "Python: $(python --version)"
|
||||||
|
echo "Ansible: $(ansible --version 2>/dev/null | head -1)"
|
||||||
|
echo "hcloud: $(hcloud version 2>/dev/null)"
|
||||||
|
echo ""
|
||||||
|
echo "Pipeline tools: lxml, pygit2, fastapi"
|
||||||
|
echo "Infra tools: ansible, hcloud"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Laad .env als die bestaat (API keys, Hetzner token)
|
||||||
|
if [ -f .env ]; then
|
||||||
|
set -a
|
||||||
|
source .env
|
||||||
|
set +a
|
||||||
|
echo "Loaded environment from .env"
|
||||||
|
echo ""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Venv voor PyPI-only packages (agentmail etc.)
|
||||||
|
if [ ! -d .venv ]; then
|
||||||
|
uv venv .venv --python python3.13 --seed
|
||||||
|
uv pip install --python .venv/bin/python agentmail
|
||||||
|
echo "Created .venv and installed PyPI dependencies"
|
||||||
|
fi
|
||||||
|
source .venv/bin/activate
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
92
pyproject.toml
Normal file
92
pyproject.toml
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
[project]
|
||||||
|
name = "wetgit"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Nederlandse wetgeving als code — elke wet een Markdown-bestand, elke wijziging een Git-commit"
|
||||||
|
readme = "README.md"
|
||||||
|
license = "MIT"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
authors = [
|
||||||
|
{ name = "Coornhert", email = "coornhert@wetgit.nl" },
|
||||||
|
]
|
||||||
|
keywords = ["wetgeving", "dutch-law", "bwb", "git", "markdown"]
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 2 - Pre-Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Legal Industry",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
|
"Topic :: Text Processing :: Markup",
|
||||||
|
]
|
||||||
|
dependencies = [
|
||||||
|
"lxml>=5.0",
|
||||||
|
"pygit2>=1.13",
|
||||||
|
"pyyaml>=6.0",
|
||||||
|
"python-frontmatter>=1.1",
|
||||||
|
"httpx>=0.27",
|
||||||
|
"click>=8.1",
|
||||||
|
"rich>=13.0",
|
||||||
|
"pydantic>=2.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
api = [
|
||||||
|
"fastapi>=0.115",
|
||||||
|
"uvicorn>=0.30",
|
||||||
|
"celery>=5.4",
|
||||||
|
"redis>=5.0",
|
||||||
|
]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0",
|
||||||
|
"pytest-cov>=5.0",
|
||||||
|
"pytest-asyncio>=0.24",
|
||||||
|
"black>=24.0",
|
||||||
|
"ruff>=0.6",
|
||||||
|
"mypy>=1.11",
|
||||||
|
"types-pyyaml",
|
||||||
|
"types-requests",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
wetgit = "wetgit.cli.main:cli"
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://wetgit.nl"
|
||||||
|
Repository = "https://git.wetgit.nl/wetgit/meta"
|
||||||
|
Issues = "https://git.wetgit.nl/wetgit/meta/issues"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=75.0"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
markers = [
|
||||||
|
"unit: Unit tests (fast, no I/O)",
|
||||||
|
"integration: Integration tests (may need network/disk)",
|
||||||
|
"slow: Slow tests (large XML parsing, bulk operations)",
|
||||||
|
]
|
||||||
|
asyncio_mode = "auto"
|
||||||
|
|
||||||
|
[tool.black]
|
||||||
|
line-length = 99
|
||||||
|
target-version = ["py313"]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 99
|
||||||
|
target-version = "py313"
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["E", "F", "I", "N", "W", "UP", "B", "A", "SIM", "TCH"]
|
||||||
|
ignore = ["E501"]
|
||||||
|
|
||||||
|
[tool.ruff.lint.isort]
|
||||||
|
known-first-party = ["wetgit"]
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.13"
|
||||||
|
warn_return_any = true
|
||||||
|
warn_unused_configs = true
|
||||||
|
disallow_untyped_defs = true
|
||||||
3
src/wetgit/__init__.py
Normal file
3
src/wetgit/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
"""WetGit — Nederlandse wetgeving als code."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
1
src/wetgit/api/__init__.py
Normal file
1
src/wetgit/api/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""FastAPI REST API."""
|
||||||
1
src/wetgit/cli/__init__.py
Normal file
1
src/wetgit/cli/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""WetGit CLI tool."""
|
||||||
17
src/wetgit/cli/main.py
Normal file
17
src/wetgit/cli/main.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
"""WetGit CLI — command-line interface."""
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from wetgit import __version__
|
||||||
|
|
||||||
|
|
||||||
|
# Root command group of the `wetgit` CLI; subcommands attach via `@cli.command()`.
# NOTE: the docstring below is shown by click as the `--help` text, so it is
# user-facing runtime text and intentionally left untouched.
@click.group()
@click.version_option(version=__version__, prog_name="wetgit")
def cli() -> None:
    """WetGit — Nederlandse wetgeving als code."""
|
||||||
|
|
||||||
|
|
||||||
|
# Subcommand `wetgit version`. The docstring doubles as click's help text for
# the command, so it is kept verbatim.
@cli.command()
def version() -> None:
    """Toon de WetGit versie."""
    banner = f"wetgit {__version__}"
    click.echo(banner)
|
||||||
55
src/wetgit/models.py
Normal file
55
src/wetgit/models.py
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
"""Domain models voor WetGit."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import date
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class RegelingType(str, Enum):
    """Kind of regulation, following the BWB classification.

    Members are also ``str`` instances, so each member compares equal to its
    plain string value.
    """

    WET = "wet"
    AMVB = "amvb"
    MINISTERIELE_REGELING = "ministeriele-regeling"
    KB = "kb"
    RIJKSWET = "rijkswet"
    VERDRAG = "verdrag"
    BELEIDSREGEL = "beleidsregel"
    CIRCULAIRE = "circulaire"
    ZBO = "zbo"
|
||||||
|
|
||||||
|
|
||||||
|
class RegelingStatus(str, Enum):
    """Status of a regulation.

    Members are also ``str`` instances, so each member compares equal to its
    plain string value.
    """

    GELDEND = "geldend"
    VERVALLEN = "vervallen"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Regeling:
    """Metadata describing a single regulation.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. The list held in ``eu_implementatie`` is itself still a
    mutable object.
    """

    # Required identification and classification.
    bwb_id: str
    titel: str
    type: RegelingType
    status: RegelingStatus
    datum_inwerkingtreding: date

    # Optional dates.
    datum_laatste_wijziging: date | None = None
    datum_verval: date | None = None

    # Optional descriptive metadata.
    citeertitel: str | None = None
    ministerie: str | None = None
    bron_url: str | None = None

    # A fresh empty list is created for every instance.
    eu_implementatie: list[dict[str, str]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Artikel:
    """A single article within a regulation.

    Frozen: attributes cannot be reassigned after construction.
    """

    nummer: str
    # No default: the title must be passed explicitly, but may be None.
    titel: str | None
    inhoud: str
    # A fresh empty list is created for every instance.
    leden: list[str] = field(default_factory=list)
|
||||||
1
src/wetgit/pipeline/__init__.py
Normal file
1
src/wetgit/pipeline/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
"""BWB/CVDR/EUR-Lex conversie-pipeline."""
|
||||||
487
src/wetgit/pipeline/bwb_parser.py
Normal file
487
src/wetgit/pipeline/bwb_parser.py
Normal file
|
|
@ -0,0 +1,487 @@
|
||||||
|
"""BWB XML naar Markdown parser.
|
||||||
|
|
||||||
|
Parseert BWB toestand-XML (schema versie 2.0) naar Markdown + YAML frontmatter.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import date
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParsedRegeling:
    """Outcome of parsing one BWB "toestand" XML document."""

    # Metadata extracted from the XML.
    bwb_id: str
    titel: str
    citeertitel: str | None
    soort: str
    datum_inwerkingtreding: str | None

    # The complete Markdown document, including the YAML frontmatter block.
    markdown: str
    # The mapping that was serialised into the frontmatter block.
    frontmatter: dict[str, str | list[str] | None]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_bwb_xml(xml_path: str) -> ParsedRegeling:
    """Convert a BWB "toestand" XML file into Markdown with YAML frontmatter.

    Args:
        xml_path: Path to the BWB XML file.

    Returns:
        A ParsedRegeling holding the extracted metadata, the frontmatter
        mapping and the complete Markdown document.

    Raises:
        ValueError: If the document contains no <wetgeving> element.
    """
    document = etree.parse(xml_path)
    wetgeving = document.find(".//wetgeving")
    if wetgeving is None:
        raise ValueError(f"Geen <wetgeving> element gevonden in {xml_path}")

    # Metadata: the citation title wins over the formal title, which wins
    # over the bare BWB id.
    soort = wetgeving.get("soort", "onbekend")
    bwb_id = _extract_bwb_id(document)
    datum = wetgeving.get("inwerkingtredingsdatum")
    intitule = wetgeving.findtext(".//intitule", default="").strip()
    citeertitel = wetgeving.findtext(".//citeertitel", default="").strip() or None
    titel = citeertitel or intitule or bwb_id

    # Locate the text body. The candidates are tried in order and the first
    # hit is used: formal acts, ministerial regulations, circulars, and
    # finally a bare <wettekst> anywhere below <wetgeving>.
    body_paths = (
        ".//wet-besluit//wettekst",
        ".//regeling-tekst",
        ".//circulaire-tekst",
        ".//wettekst",
    )
    body = None
    for path in body_paths:
        body = wetgeving.find(path)
        if body is not None:
            break

    md_parts: list[str] = [] if body is None else _parse_wettekst(body)

    # Annexes are collected separately and appended after the body.
    md_parts.extend(
        rendered
        for rendered in map(_parse_bijlage, wetgeving.findall(".//bijlage"))
        if rendered
    )

    # Key order matters: the YAML is dumped with sort_keys=False.
    frontmatter: dict[str, str | list[str] | None] = {
        "titel": titel,
        "bwb_id": bwb_id,
        "type": soort,
        "status": "geldend",
        "datum_inwerkingtreding": datum,
        "bron": f"https://wetten.overheid.nl/{bwb_id}",
    }
    if citeertitel:
        frontmatter["citeertitel"] = citeertitel

    fm_yaml = yaml.dump(
        frontmatter,
        allow_unicode=True,
        default_flow_style=False,
        sort_keys=False,
    )
    markdown = "".join(
        [
            f"---\n{fm_yaml.strip()}\n---\n\n# {titel}\n\n",
            "\n\n".join(md_parts),
            "\n",
        ]
    )

    return ParsedRegeling(
        bwb_id=bwb_id,
        titel=titel,
        citeertitel=citeertitel,
        soort=soort,
        datum_inwerkingtreding=datum,
        markdown=markdown,
        frontmatter=frontmatter,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_bwb_id(tree: etree._ElementTree) -> str:
|
||||||
|
"""Haal BWB-ID op uit het XML-document."""
|
||||||
|
root = tree.getroot()
|
||||||
|
# Probeer eerst via wetgeving stam-id
|
||||||
|
wetgeving = tree.find(".//wetgeving")
|
||||||
|
if wetgeving is not None:
|
||||||
|
stam_id = wetgeving.get("stam-id", "")
|
||||||
|
# stam-id is niet het BWB-ID, zoek in meta-data
|
||||||
|
# Zoek in meta-data
|
||||||
|
for elem in tree.iter():
|
||||||
|
if elem.tag == "toestand":
|
||||||
|
bwb_id = elem.get("bwb-id")
|
||||||
|
if bwb_id:
|
||||||
|
return bwb_id
|
||||||
|
# Fallback: zoek in bwb-inputbestand
|
||||||
|
inp = tree.find(".//bwb-inputbestand")
|
||||||
|
if inp is not None:
|
||||||
|
for child in inp.iter():
|
||||||
|
bwb_id = child.get("bwb-id")
|
||||||
|
if bwb_id:
|
||||||
|
return bwb_id
|
||||||
|
return "ONBEKEND"
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_wettekst(wettekst: etree._Element) -> list[str]:
|
||||||
|
"""Parse het <wettekst> element naar Markdown-blokken."""
|
||||||
|
parts: list[str] = []
|
||||||
|
for child in wettekst:
|
||||||
|
tag = child.tag
|
||||||
|
if tag == "deel":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label="Deel"))
|
||||||
|
elif tag == "boek":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label="Boek"))
|
||||||
|
elif tag == "hoofdstuk":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label="Hoofdstuk"))
|
||||||
|
elif tag == "titeldeel":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label="Titel"))
|
||||||
|
elif tag == "afdeling":
|
||||||
|
parts.append(_parse_structuur(child, level=3, label="Afdeling"))
|
||||||
|
elif tag == "paragraaf":
|
||||||
|
parts.append(_parse_structuur(child, level=3, label="Paragraaf"))
|
||||||
|
elif tag == "circulaire.divisie":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label=""))
|
||||||
|
elif tag == "sub-paragraaf":
|
||||||
|
parts.append(_parse_structuur(child, level=3, label="Paragraaf"))
|
||||||
|
elif tag == "divisie":
|
||||||
|
parts.append(_parse_structuur(child, level=2, label=""))
|
||||||
|
elif tag in ("artikel", "enig-artikel"):
|
||||||
|
parts.append(_parse_artikel(child))
|
||||||
|
elif tag == "tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
elif tag == "table":
|
||||||
|
parts.append(_parse_cals_table(child))
|
||||||
|
elif tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif tag in ("plaatje", "illustratie"):
|
||||||
|
parts.append("*[afbeelding]*")
|
||||||
|
return [p for p in parts if p.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_structuur(elem: etree._Element, level: int, label: str) -> str:
|
||||||
|
"""Parse een structuurelement (hoofdstuk, afdeling, paragraaf, boek)."""
|
||||||
|
nr = ""
|
||||||
|
titel = ""
|
||||||
|
|
||||||
|
# <kop> element bevat nr en titel bij boeken en sommige andere structuren
|
||||||
|
kop = elem.find("./kop")
|
||||||
|
if kop is not None:
|
||||||
|
nr = kop.findtext("./nr", default="").strip()
|
||||||
|
titel = kop.findtext("./titel", default="").strip()
|
||||||
|
|
||||||
|
# Fallback: directe child-elementen
|
||||||
|
if not nr:
|
||||||
|
nr = elem.findtext("./nr", default="").strip()
|
||||||
|
if not titel:
|
||||||
|
titel = (
|
||||||
|
elem.findtext("./hoofdstuktitel", default="")
|
||||||
|
or elem.findtext("./titeldeel-titel", default="")
|
||||||
|
or elem.findtext("./afdelingtitel", default="")
|
||||||
|
or elem.findtext("./paragraaftitel", default="")
|
||||||
|
or elem.findtext("./boektitel", default="")
|
||||||
|
or elem.findtext("./titel", default="")
|
||||||
|
).strip()
|
||||||
|
|
||||||
|
heading = "#" * level
|
||||||
|
header = f"{heading} {label} {nr}"
|
||||||
|
if titel:
|
||||||
|
header += f". {titel}"
|
||||||
|
|
||||||
|
skip_tags = {
|
||||||
|
"nr", "kop", "titel",
|
||||||
|
"hoofdstuktitel", "titeldeel-titel", "afdelingtitel",
|
||||||
|
"paragraaftitel", "boektitel",
|
||||||
|
}
|
||||||
|
|
||||||
|
parts = [header]
|
||||||
|
|
||||||
|
for child in elem:
|
||||||
|
tag = child.tag
|
||||||
|
if tag in skip_tags:
|
||||||
|
continue
|
||||||
|
elif tag == "deel":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Deel"))
|
||||||
|
elif tag == "boek":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Boek"))
|
||||||
|
elif tag == "hoofdstuk":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Hoofdstuk"))
|
||||||
|
elif tag == "titeldeel":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Titel"))
|
||||||
|
elif tag == "afdeling":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Afdeling"))
|
||||||
|
elif tag == "paragraaf":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Paragraaf"))
|
||||||
|
elif tag == "circulaire.divisie":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label=""))
|
||||||
|
elif tag == "sub-paragraaf":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label="Paragraaf"))
|
||||||
|
elif tag == "divisie":
|
||||||
|
parts.append(_parse_structuur(child, level=level + 1, label=""))
|
||||||
|
elif tag in ("artikel", "enig-artikel"):
|
||||||
|
parts.append(_parse_artikel(child))
|
||||||
|
elif tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
elif tag == "tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag == "tussenkop":
|
||||||
|
text = _get_text(child).strip()
|
||||||
|
if text:
|
||||||
|
sub_heading = "#" * min(level + 1, 6)
|
||||||
|
parts.append(f"{sub_heading} {text}")
|
||||||
|
elif tag == "redactie":
|
||||||
|
text = _get_text(child).strip()
|
||||||
|
if text:
|
||||||
|
parts.append(f"*[{text}]*")
|
||||||
|
elif tag == "table":
|
||||||
|
parts.append(_parse_cals_table(child))
|
||||||
|
elif tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif tag in ("plaatje", "illustratie"):
|
||||||
|
parts.append("*[afbeelding]*")
|
||||||
|
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_artikel(artikel: etree._Element) -> str:
|
||||||
|
"""Parse een <artikel> element naar Markdown."""
|
||||||
|
nr = artikel.findtext(".//nr", default="").strip()
|
||||||
|
heading = f"### Artikel {nr}" if nr else "### Artikel"
|
||||||
|
|
||||||
|
parts = [heading]
|
||||||
|
|
||||||
|
for child in artikel:
|
||||||
|
tag = child.tag
|
||||||
|
if tag == "nr":
|
||||||
|
continue
|
||||||
|
elif tag == "titel":
|
||||||
|
titel_text = _get_text(child).strip()
|
||||||
|
if titel_text:
|
||||||
|
parts.append(f"*{titel_text}*")
|
||||||
|
elif tag == "lid":
|
||||||
|
parts.append(_parse_lid(child))
|
||||||
|
elif tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
elif tag == "lijst":
|
||||||
|
parts.append(_parse_lijst(child))
|
||||||
|
elif tag == "tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif tag == "gereserveerd":
|
||||||
|
parts.append("*[Dit artikel is gereserveerd.]*")
|
||||||
|
elif tag == "vervallen":
|
||||||
|
parts.append("*[Dit artikel is vervallen.]*")
|
||||||
|
elif tag == "lid-vervallen":
|
||||||
|
lidnr = child.findtext(".//lidnr", default="").strip()
|
||||||
|
parts.append(f"**{lidnr}.** *[Vervallen.]*" if lidnr else "*[Lid vervallen.]*")
|
||||||
|
elif tag == "lidnr":
|
||||||
|
continue
|
||||||
|
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_lid(lid: etree._Element) -> str:
|
||||||
|
"""Parse een <lid> element."""
|
||||||
|
lidnr = lid.findtext(".//lidnr", default="").strip()
|
||||||
|
parts: list[str] = []
|
||||||
|
if lidnr:
|
||||||
|
parts.append(f"**{lidnr}.**")
|
||||||
|
|
||||||
|
for child in lid:
|
||||||
|
tag = child.tag
|
||||||
|
if tag == "lidnr":
|
||||||
|
continue
|
||||||
|
elif tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
elif tag == "lijst":
|
||||||
|
parts.append(_parse_lijst(child))
|
||||||
|
elif tag == "tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag == "table":
|
||||||
|
parts.append(_parse_cals_table(child))
|
||||||
|
elif tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif tag == "formule":
|
||||||
|
parts.append(f"*[formule: {_get_text(child)}]*")
|
||||||
|
elif tag == "redactie":
|
||||||
|
text = _get_text(child).strip()
|
||||||
|
if text:
|
||||||
|
parts.append(f"*[{text}]*")
|
||||||
|
|
||||||
|
return " ".join(parts) if lidnr and len(parts) <= 2 else "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_lijst(lijst: etree._Element) -> str:
|
||||||
|
"""Parse een <lijst> element naar Markdown-lijst."""
|
||||||
|
items: list[str] = []
|
||||||
|
for li in lijst.findall(".//li"):
|
||||||
|
nr = li.findtext(".//li.nr", default="").strip()
|
||||||
|
body = li.find(".//li.body")
|
||||||
|
if body is not None:
|
||||||
|
text = _parse_tekst_content(body)
|
||||||
|
else:
|
||||||
|
text = _get_text(li)
|
||||||
|
prefix = f"{nr} " if nr else "- "
|
||||||
|
items.append(f"{prefix}{text}")
|
||||||
|
return "\n".join(items)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_tekst_content(elem: etree._Element) -> str:
|
||||||
|
"""Parse gemengde content (al, lijst, etc.) binnen een element."""
|
||||||
|
parts: list[str] = []
|
||||||
|
for child in elem:
|
||||||
|
if child.tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
elif child.tag == "lijst":
|
||||||
|
parts.append(_parse_lijst(child))
|
||||||
|
elif child.tag == "table":
|
||||||
|
parts.append(_parse_cals_table(child))
|
||||||
|
elif child.tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif child.tag in ("plaatje", "illustratie"):
|
||||||
|
parts.append("*[afbeelding]*")
|
||||||
|
elif child.tag == "redactie":
|
||||||
|
text = _get_text(child).strip()
|
||||||
|
if text:
|
||||||
|
parts.append(f"*[{text}]*")
|
||||||
|
if not parts:
|
||||||
|
text = _get_text(elem)
|
||||||
|
if text:
|
||||||
|
parts.append(text)
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_cals_table(table: etree._Element) -> str:
|
||||||
|
"""Parse een CALS <table> element naar Markdown tabel."""
|
||||||
|
rows: list[list[str]] = []
|
||||||
|
has_header = False
|
||||||
|
|
||||||
|
# Thead
|
||||||
|
thead = table.find(".//thead")
|
||||||
|
if thead is not None:
|
||||||
|
has_header = True
|
||||||
|
for row in thead.findall(".//row"):
|
||||||
|
cells = [_get_text(e) for e in row.findall(".//entry")]
|
||||||
|
rows.append(cells)
|
||||||
|
|
||||||
|
# Tbody
|
||||||
|
tbody = table.find(".//tbody")
|
||||||
|
if tbody is not None:
|
||||||
|
for row in tbody.findall(".//row"):
|
||||||
|
cells = [_get_text(e) for e in row.findall(".//entry")]
|
||||||
|
rows.append(cells)
|
||||||
|
else:
|
||||||
|
# Geen tbody — rows direct onder tgroup
|
||||||
|
for row in table.findall(".//row"):
|
||||||
|
cells = [_get_text(e) for e in row.findall(".//entry")]
|
||||||
|
rows.append(cells)
|
||||||
|
|
||||||
|
if not rows:
|
||||||
|
return "*[tabel]*"
|
||||||
|
|
||||||
|
# Normaliseer kolom-aantallen
|
||||||
|
max_cols = max(len(r) for r in rows)
|
||||||
|
for r in rows:
|
||||||
|
while len(r) < max_cols:
|
||||||
|
r.append("")
|
||||||
|
|
||||||
|
# Markdown tabel genereren
|
||||||
|
lines: list[str] = []
|
||||||
|
for i, row in enumerate(rows):
|
||||||
|
escaped = [cell.replace("|", "\\|").replace("\n", " ") for cell in row]
|
||||||
|
lines.append("| " + " | ".join(escaped) + " |")
|
||||||
|
if i == 0:
|
||||||
|
lines.append("| " + " | ".join("---" for _ in escaped) + " |")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_bijlage(bijlage: etree._Element) -> str:
|
||||||
|
"""Parse een <bijlage> element naar Markdown."""
|
||||||
|
# Titel uit <kop>
|
||||||
|
kop = bijlage.find("./kop")
|
||||||
|
nr = ""
|
||||||
|
titel = ""
|
||||||
|
if kop is not None:
|
||||||
|
nr = kop.findtext("./nr", default="").strip()
|
||||||
|
titel = kop.findtext("./titel", default="").strip()
|
||||||
|
|
||||||
|
header = f"## Bijlage {nr}"
|
||||||
|
if titel:
|
||||||
|
header += f". {titel}"
|
||||||
|
|
||||||
|
parts = [header]
|
||||||
|
|
||||||
|
for child in bijlage:
|
||||||
|
tag = child.tag
|
||||||
|
if tag == "kop":
|
||||||
|
continue
|
||||||
|
elif tag == "artikel":
|
||||||
|
parts.append(_parse_artikel(child))
|
||||||
|
elif tag == "tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag == "tabel":
|
||||||
|
parts.append("*[tabel]*")
|
||||||
|
elif tag == "plaatje":
|
||||||
|
parts.append("*[afbeelding]*")
|
||||||
|
elif tag == "bijlage-tekst":
|
||||||
|
parts.append(_parse_tekst_content(child))
|
||||||
|
elif tag in ("hoofdstuk", "titeldeel", "afdeling", "paragraaf"):
|
||||||
|
parts.append(_parse_structuur(child, level=3, label=tag.capitalize()))
|
||||||
|
elif tag == "al":
|
||||||
|
parts.append(_get_text(child))
|
||||||
|
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_text(elem: etree._Element) -> str:
    """Return the element's full text with inline Markdown formatting applied.

    The fragments are gathered by ``_collect_text``; the concatenated result
    has its leading and trailing whitespace removed.

    Args:
        elem: The element to flatten.

    Returns:
        The flattened, stripped text.
    """
    fragments: list[str] = []
    _collect_text(elem, fragments)
    joined = "".join(fragments)
    return joined.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_text(elem: etree._Element, parts: list[str]) -> None:
    """Recursively append the text of *elem* to *parts* as inline Markdown.

    Inline elements are rendered as follows:

    * ``<nadruk type="vet">`` -> ``**text**``; ``type="cur"`` -> ``*text*``;
      any other type, or emphasis without visible content, -> plain text.
    * ``<sup>`` -> ``^text``; ``<sub>`` / ``<inf>`` -> ``_text``.
    * ``<nootref>`` -> its plain text.
    * ``<extref>``, ``<intref>`` and unknown elements -> processed
      recursively (link targets are not preserved).

    Comments and processing instructions contribute no text of their own,
    but the text following them (their tail) is kept.

    Args:
        elem: Element whose text content is collected.
        parts: Output list; fragments are appended in document order.
    """
    if elem.text:
        parts.append(elem.text)

    for child in elem:
        tag = child.tag
        if not isinstance(tag, str):
            # Comments / processing instructions have a non-string tag and
            # their ``text`` is the comment or PI body, not document text.
            pass
        elif tag == "nadruk":
            inner = "".join(child.itertext())
            nadruk_type = child.get("type", "")
            if not inner.strip():
                # Wrapping nothing would emit bare "****" / "**" markers.
                parts.append(inner)
            elif nadruk_type == "vet":
                parts.append(f"**{inner}**")
            elif nadruk_type == "cur":
                parts.append(f"*{inner}*")
            else:
                parts.append(inner)
        elif tag == "sup":
            parts.append("^" + "".join(child.itertext()))
        elif tag in ("sub", "inf"):
            parts.append("_" + "".join(child.itertext()))
        elif tag in ("extref", "intref"):
            # Keep the text; the link information is dropped in Markdown v0.1.
            _collect_text(child, parts)
        elif tag == "nootref":
            # Footnote reference: include its text as-is.
            parts.append("".join(child.itertext()))
        else:
            # Unknown inline element: just include its text.
            _collect_text(child, parts)

        # Text after the child's closing tag belongs to the parent's content.
        if child.tail:
            parts.append(child.tail)
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
59
tests/pipeline/test_bwb_parser.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
"""Tests voor de BWB XML parser."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from wetgit.pipeline.bwb_parser import parse_bwb_xml
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def grondwet_xml(tmp_path_factory):
    """Download the Grondwet XML once and return the path to the local copy.

    The fixture is module-scoped so that all tests in this module share a
    single download instead of fetching the same remote file once per test.
    The tests only read the file, so sharing it is safe.

    Returns:
        The path of the downloaded XML file, as a string.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # Local import: httpx is only needed when the fixture actually runs.
    import httpx

    url = "https://repository.officiele-overheidspublicaties.nl/bwb/BWBR0001840/2023-02-22_0/xml/BWBR0001840_2023-02-22_0.xml"
    resp = httpx.get(url, timeout=30)
    resp.raise_for_status()

    # tmp_path is function-scoped; a module-scoped fixture needs the factory.
    xml_path = tmp_path_factory.mktemp("bwb") / "grondwet.xml"
    xml_path.write_bytes(resp.content)
    return str(xml_path)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
class TestBWBParser:
    """Integration tests that run the BWB parser on the Grondwet XML."""

    def test_parse_grondwet_metadata(self, grondwet_xml: str) -> None:
        """The parsed result exposes the expected id, title and kind."""
        parsed = parse_bwb_xml(grondwet_xml)
        assert parsed.bwb_id == "BWBR0001840"
        assert parsed.titel == "Grondwet"
        assert parsed.soort == "wet"

    def test_parse_grondwet_frontmatter(self, grondwet_xml: str) -> None:
        """The frontmatter mapping carries id, type, status and source."""
        parsed = parse_bwb_xml(grondwet_xml)
        assert parsed.frontmatter["bwb_id"] == "BWBR0001840"
        assert parsed.frontmatter["type"] == "wet"
        assert parsed.frontmatter["status"] == "geldend"
        source = str(parsed.frontmatter["bron"])
        assert "wetten.overheid.nl" in source

    def test_parse_grondwet_has_artikel_1(self, grondwet_xml: str) -> None:
        """Article 1 is rendered with its heading and a known phrase."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert "### Artikel 1" in document
        assert "gelijke gevallen gelijk behandeld" in document

    def test_parse_grondwet_has_hoofdstukken(self, grondwet_xml: str) -> None:
        """The first two chapters appear as level-2 headings."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert "## Hoofdstuk 1" in document
        assert "## Hoofdstuk 2" in document

    def test_parse_grondwet_yaml_frontmatter(self, grondwet_xml: str) -> None:
        """The Markdown opens with a delimited YAML frontmatter block."""
        document = parse_bwb_xml(grondwet_xml).markdown
        assert document.startswith("---\n")
        assert "\n---\n" in document

    def test_parse_grondwet_markdown_structure(self, grondwet_xml: str) -> None:
        """The document starts with frontmatter and has exactly one h1."""
        parsed = parse_bwb_xml(grondwet_xml)
        md_lines = parsed.markdown.split("\n")

        # The very first line opens the frontmatter.
        assert md_lines[0] == "---"

        # Exactly one top-level heading, and it is the law's title.
        top_headings = [line for line in md_lines if line.startswith("# ")]
        assert len(top_headings) == 1
        assert top_headings[0] == "# Grondwet"
|
||||||
Loading…
Add table
Reference in a new issue