Initial commit — Pseudonymisation de PDF v5

- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 15:03:37 +01:00
commit 8339069c83
18 changed files with 5127 additions and 0 deletions
--- a/config/dictionnaires.yml
+++ b/config/dictionnaires.yml
@@ -0,0 +1,37 @@
+version: 1  # presumably the schema version of this file; confirm with the loader
+encoding: utf-8
+normalization: NFKC  # name of a Unicode normalization form; where it is applied is not shown here
+whitelist:  # NOTE(review): presumably terms that must not be masked; confirm
+  sections_titres:  # upper-case words listed as section titles
+  - DIM
+  - GHM
+  - GHS
+  - RUM
+  - COMPTE
+  - RENDU
+  - DIAGNOSTIC
+  noms_maj_excepts:  # presumably exceptions to upper-case name detection; verify
+  - Médecin DIM
+  - Praticien conseil
+  org_gpe_keep: true  # NOTE(review): looks like ORG/GPE entities are kept; confirm against the NER code
+blacklist:  # NOTE(review): presumably terms that are always masked; confirm
+  force_mask_terms:  # literal terms
+  - CENTRE HOSPITALIER COTE BASQUE
+  - 'Dates du séjour :'
+  - CONCERTATION
+  force_mask_regex: []  # explicit empty list: no regex entries defined
+kv_labels_preserve:  # presumably key/value labels whose label text is preserved; confirm
+- FINESS
+- IPP
+- N° OGC
+- Etablissement
+regex_overrides:  # list of named regex rules (name, pattern, placeholder, flags)
+- name: OGC_court  # pattern below: optional "N°", then "OGC", optional ":" or "-", then captures 1-3 letters/digits/hyphens
+  pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
+  placeholder: '[OGC]'  # presumably the replacement text for a match; confirm
+  flags:  # per-rule flags
+  - IGNORECASE  # same name as Python's re.IGNORECASE; how it maps to the engine is assumed
+flags:  # top-level options, separate from the per-rule flags list above
+  case_insensitive: true
+  unicode_word_boundaries: true
+  regex_engine: python