From 34dcf8f3604730b901f6dde62403ec096d4faa00 Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Tue, 21 Apr 2026 10:32:57 +0200 Subject: [PATCH] Externalize dictionaries and add anonymization review corpus --- FONCTIONNEMENT.md | 8 +- Pseudonymisation_Gui_Models_V4.py | 41 +- Pseudonymisation_Gui_V5.py | 46 +- anonymizer_core_refactored.py | 48 +- anonymizer_core_refactored_onnx.py | 786 +++++------------- build_windows.bat | 1 + config/dictionnaires.default.yml | 59 ++ config/dictionnaires.yml | 94 +-- config_defaults.py | 177 ++++ data/bdpm/medication_whitelist_manual.txt | 11 + data/finess/address_blacklist.txt | 7 + data/finess/generic_name_blacklist.txt | 112 +++ data/finess/generic_phrase_blacklist.txt | 26 + pseudonymisation_pipeline_gui_v3.py | 45 +- run_batch_30_audit.py | 3 +- run_batch_59ogc.py | 3 +- run_batch_silver_export.py | 4 +- scripts/merge_params.py | 7 +- server.py | 6 +- test_gui_error.py | 3 +- test_gui_fixed.py | 3 +- tests/conftest.py | 12 + tests/synthetic_regression/README.md | 26 + .../expected.audit.json | 22 + .../001_patient_header_and_birth/expected.txt | 3 + .../001_patient_header_and_birth/input.txt | 3 + .../001_patient_header_and_birth/test.txt | 3 + .../002_contact_bundle/expected.audit.json | 12 + .../cases/002_contact_bundle/expected.txt | 1 + .../cases/002_contact_bundle/input.txt | 1 + .../cases/002_contact_bundle/test.txt | 1 + .../expected.audit.json | 7 + .../003_multiline_venue_number/expected.txt | 3 + .../003_multiline_venue_number/input.txt | 3 + .../cases/003_multiline_venue_number/test.txt | 3 + .../004_identifier_bundle/expected.audit.json | 27 + .../cases/004_identifier_bundle/expected.txt | 5 + .../cases/004_identifier_bundle/input.txt | 5 + .../cases/004_identifier_bundle/test.txt | 5 + .../expected.audit.json | 7 + .../005_force_mask_default_term/expected.txt | 1 + .../005_force_mask_default_term/input.txt | 1 + .../005_force_mask_default_term/test.txt | 1 + .../expected.audit.json | 1 + .../expected.txt | 1 + 
.../006_whitelist_phrases_preserved/input.txt | 1 + .../006_whitelist_phrases_preserved/test.txt | 1 + .../config_overlay.yml | 3 + .../expected.audit.json | 7 + .../007_overlay_force_mask_local/expected.txt | 1 + .../007_overlay_force_mask_local/input.txt | 1 + .../007_overlay_force_mask_local/test.txt | 1 + .../008_ville_header/expected.audit.json | 7 + .../cases/008_ville_header/expected.txt | 2 + .../cases/008_ville_header/input.txt | 2 + .../cases/008_ville_header/test.txt | 2 + .../expected.audit.json | 17 + .../009_header_and_repeated_name/expected.txt | 2 + .../009_header_and_repeated_name/input.txt | 2 + .../009_header_and_repeated_name/test.txt | 2 + .../expected.audit.json | 7 + .../expected.txt | 2 + .../010_spaced_establishment_header/input.txt | 2 + .../010_spaced_establishment_header/test.txt | 2 + tests/synthetic_regression/manifest.json | 110 +++ tests/synthetic_regression/tests.md | 25 + tests/synthetic_review/README.md | 26 + .../expectations.json | 31 + .../expected.txt | 19 + .../review.md | 10 + .../001_crh_hospitalisation_complete/test.txt | 19 + .../002_imagerie_complete/expectations.json | 26 + .../cases/002_imagerie_complete/expected.txt | 13 + .../cases/002_imagerie_complete/review.md | 7 + .../cases/002_imagerie_complete/test.txt | 13 + .../expectations.json | 29 + .../003_consultation_complete/expected.txt | 14 + .../cases/003_consultation_complete/review.md | 7 + .../cases/003_consultation_complete/test.txt | 14 + .../expectations.json | 27 + .../expected.txt | 11 + .../004_structured_admin_complete/review.md | 7 + .../004_structured_admin_complete/test.txt | 12 + tests/synthetic_review/tests.md | 15 + tests/unit/test_config_externalization.py | 92 ++ tests/unit/test_header_pii_detection.py | 63 ++ tests/unit/test_synthetic_regression.py | 100 +++ tools/debug_force_term.py | 10 +- tools/quick_test_date_correction.py | 4 +- tools/run_synthetic_review_corpus.py | 169 ++++ tools/test_all_cro.py | 3 +- tools/test_chcb_leak.py | 5 +- 
tools/test_date_propagation.py | 3 +- tools/test_gui_complete.py | 3 +- tools/test_gui_simulation.py | 3 +- tools/test_phase1_corrections.py | 3 +- tools/validate_corpus_sample.py | 3 +- tools/validate_full_corpus.py | 3 +- tools/validate_phase1_on_production.py | 3 +- 99 files changed, 1805 insertions(+), 805 deletions(-) create mode 100644 config/dictionnaires.default.yml create mode 100644 config_defaults.py create mode 100644 data/bdpm/medication_whitelist_manual.txt create mode 100644 data/finess/address_blacklist.txt create mode 100644 data/finess/generic_name_blacklist.txt create mode 100644 data/finess/generic_phrase_blacklist.txt create mode 100644 tests/conftest.py create mode 100644 tests/synthetic_regression/README.md create mode 100644 tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json create mode 100644 tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt create mode 100644 tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt create mode 100644 tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt create mode 100644 tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json create mode 100644 tests/synthetic_regression/cases/002_contact_bundle/expected.txt create mode 100644 tests/synthetic_regression/cases/002_contact_bundle/input.txt create mode 100644 tests/synthetic_regression/cases/002_contact_bundle/test.txt create mode 100644 tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json create mode 100644 tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt create mode 100644 tests/synthetic_regression/cases/003_multiline_venue_number/input.txt create mode 100644 tests/synthetic_regression/cases/003_multiline_venue_number/test.txt create mode 100644 tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json create mode 100644 
tests/synthetic_regression/cases/004_identifier_bundle/expected.txt create mode 100644 tests/synthetic_regression/cases/004_identifier_bundle/input.txt create mode 100644 tests/synthetic_regression/cases/004_identifier_bundle/test.txt create mode 100644 tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json create mode 100644 tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt create mode 100644 tests/synthetic_regression/cases/005_force_mask_default_term/input.txt create mode 100644 tests/synthetic_regression/cases/005_force_mask_default_term/test.txt create mode 100644 tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json create mode 100644 tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt create mode 100644 tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt create mode 100644 tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt create mode 100644 tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml create mode 100644 tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json create mode 100644 tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt create mode 100644 tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt create mode 100644 tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt create mode 100644 tests/synthetic_regression/cases/008_ville_header/expected.audit.json create mode 100644 tests/synthetic_regression/cases/008_ville_header/expected.txt create mode 100644 tests/synthetic_regression/cases/008_ville_header/input.txt create mode 100644 tests/synthetic_regression/cases/008_ville_header/test.txt create mode 100644 tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json create mode 100644 tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt 
create mode 100644 tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt create mode 100644 tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt create mode 100644 tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json create mode 100644 tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt create mode 100644 tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt create mode 100644 tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt create mode 100644 tests/synthetic_regression/manifest.json create mode 100644 tests/synthetic_regression/tests.md create mode 100644 tests/synthetic_review/README.md create mode 100644 tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json create mode 100644 tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt create mode 100644 tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md create mode 100644 tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt create mode 100644 tests/synthetic_review/cases/002_imagerie_complete/expectations.json create mode 100644 tests/synthetic_review/cases/002_imagerie_complete/expected.txt create mode 100644 tests/synthetic_review/cases/002_imagerie_complete/review.md create mode 100644 tests/synthetic_review/cases/002_imagerie_complete/test.txt create mode 100644 tests/synthetic_review/cases/003_consultation_complete/expectations.json create mode 100644 tests/synthetic_review/cases/003_consultation_complete/expected.txt create mode 100644 tests/synthetic_review/cases/003_consultation_complete/review.md create mode 100644 tests/synthetic_review/cases/003_consultation_complete/test.txt create mode 100644 tests/synthetic_review/cases/004_structured_admin_complete/expectations.json create mode 100644 tests/synthetic_review/cases/004_structured_admin_complete/expected.txt create mode 100644 
tests/synthetic_review/cases/004_structured_admin_complete/review.md create mode 100644 tests/synthetic_review/cases/004_structured_admin_complete/test.txt create mode 100644 tests/synthetic_review/tests.md create mode 100644 tests/unit/test_config_externalization.py create mode 100644 tests/unit/test_header_pii_detection.py create mode 100644 tests/unit/test_synthetic_regression.py create mode 100644 tools/run_synthetic_review_corpus.py diff --git a/FONCTIONNEMENT.md b/FONCTIONNEMENT.md index f8f0676..36b24cf 100644 --- a/FONCTIONNEMENT.md +++ b/FONCTIONNEMENT.md @@ -122,8 +122,9 @@ Fonction : `_mask_line_by_regex` | Dates | `[DATE]` | 12/03/2024 | | Adresses | `[ADRESSE]` | 12 rue de la Paix | -Configuration supplementaire via `config/dictionnaires.yml` : -listes blanches, force-mask et regex personnalisees. +Configuration : +- `config/dictionnaires.default.yml` : template versionne, source de verite des valeurs par defaut +- `config/dictionnaires.yml` : surcharge locale chargee par defaut, contenant uniquement les ecarts site/runtime ### 3. 
Reconnaissance d'entites nommees (NER) @@ -180,6 +181,7 @@ un fallback OCR est utilise : | Element | Description | |-------------------------------|------------------------------------------------| -| `config/dictionnaires.yml` | Listes blanches, force-mask, regex custom | +| `config/dictionnaires.default.yml` | Valeurs par defaut completes et versionnees | +| `config/dictionnaires.yml` | Surcharge locale optionnelle (ecarts uniquement) | | `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots) | | Ligne de commande | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` | diff --git a/Pseudonymisation_Gui_Models_V4.py b/Pseudonymisation_Gui_Models_V4.py index 2e89dca..768196a 100644 --- a/Pseudonymisation_Gui_Models_V4.py +++ b/Pseudonymisation_Gui_Models_V4.py @@ -48,33 +48,16 @@ try: except Exception: yaml = None -APP_TITLE = "Pseudonymisation de PDF" -DEFAULT_CFG = Path("config/dictionnaires.yml") +from config_defaults import ( + RUNTIME_DICTIONARIES_CONFIG_PATH, + read_default_dictionaries_text, + read_runtime_dictionaries_overlay_text, +) -DEFAULTS_CFG_TEXT = r""" -# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex) -version: 1 -encoding: "utf-8" -normalization: "NFKC" -whitelist: - sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] - noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] - org_gpe_keep: true -blacklist: - force_mask_terms: [] - force_mask_regex: [] -kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] -regex_overrides: - - name: OGC_court - pattern: |- - \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b - placeholder: '[OGC]' - flags: [IGNORECASE] -flags: - case_insensitive: true - unicode_word_boundaries: true - regex_engine: "python" -""" +APP_TITLE = "Pseudonymisation de PDF" +DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH +DEFAULTS_CFG_TEXT = read_default_dictionaries_text() +RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text() class ToolTip: @@ -208,7 
+191,7 @@ class App: # YAML helpers def _ensure_cfg_exists(self): p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True) - if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + if not p.exists(): p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8") def _cfg_browse(self): d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) if d: self.cfg_path.set(d) @@ -225,14 +208,14 @@ class App: if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return try: - Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8") + Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or {}, allow_unicode=True, sort_keys=False), encoding="utf-8") self._log("Règles sauvegardées.") except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}") def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.") def _restore_defaults(self): try: - Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg() + Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8"); self._log("Surcharge locale réinitialisée."); self._load_cfg() except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py index 472b7ca..bd680f0 100644 --- a/Pseudonymisation_Gui_V5.py +++ b/Pseudonymisation_Gui_V5.py @@ -20,7 +20,6 @@ import os import platform import queue import re -import shutil import subprocess import sys import threading @@ -75,6 +74,11 @@ try: except Exception: yaml = None +from config_defaults import ( + read_default_dictionaries_text, + read_runtime_dictionaries_overlay_text, +) + # 
--------------------------------------------------------------------------- # Thème optionnel # --------------------------------------------------------------------------- @@ -142,47 +146,19 @@ def _resolve_config() -> Path: pour que l'utilisateur puisse la modifier sans recompiler. """ exe_cfg = _exe_dir() / "config" / "dictionnaires.yml" - app_cfg = _app_dir() / "config" / "dictionnaires.yml" if exe_cfg.exists(): return exe_cfg - # Premier lancement : copier la config embarquée à côté de l'exe - if app_cfg.exists(): - exe_cfg.parent.mkdir(parents=True, exist_ok=True) - import shutil - shutil.copy2(str(app_cfg), str(exe_cfg)) - return exe_cfg - - return app_cfg # fallback + exe_cfg.parent.mkdir(parents=True, exist_ok=True) + exe_cfg.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8") + return exe_cfg DEFAULT_CFG = _resolve_config() MODELS_DIR = _app_dir() / "models" -DEFAULTS_CFG_TEXT = r""" -# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex) -version: 1 -encoding: "utf-8" -normalization: "NFKC" -whitelist: - sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] - noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] - org_gpe_keep: true -blacklist: - force_mask_terms: [] - force_mask_regex: [] -kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] -regex_overrides: - - name: OGC_court - pattern: |- - \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b - placeholder: '[OGC]' - flags: [IGNORECASE] -flags: - case_insensitive: true - unicode_word_boundaries: true - regex_engine: "python" -""" +DEFAULTS_CFG_TEXT = read_default_dictionaries_text() +RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text() # Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir) # Magenta du logo : primaire (boutons, accents) @@ -1593,7 +1569,7 @@ class App: p = Path(self.cfg_path.get()) p.parent.mkdir(parents=True, exist_ok=True) if not p.exists(): - p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + 
p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8") def _load_cfg(self): if yaml is None: diff --git a/anonymizer_core_refactored.py b/anonymizer_core_refactored.py index c1fd2f9..cfe2613 100644 --- a/anonymizer_core_refactored.py +++ b/anonymizer_core_refactored.py @@ -24,36 +24,11 @@ try: import yaml # PyYAML for dictionaries except Exception: yaml = None - -# ----------------- Defaults & Config ----------------- -DEFAULTS_CFG = { - "version": 1, - "encoding": "utf-8", - "normalization": "NFKC", - "whitelist": { - "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], - "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], - "org_gpe_keep": True, - }, - "blacklist": { - "force_mask_terms": [], - "force_mask_regex": [], - }, - "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], - "regex_overrides": [ - { - "name": "OGC_court", - "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", - "placeholder": "[OGC]", - "flags": ["IGNORECASE"], - } - ], - "flags": { - "case_insensitive": True, - "unicode_word_boundaries": True, - "regex_engine": "python", - }, -} +from config_defaults import ( + RUNTIME_DICTIONARIES_CONFIG_PATH, + load_effective_dictionaries_dict, + load_default_dictionaries_dict, +) PLACEHOLDERS = { "EMAIL": "[EMAIL]", @@ -103,16 +78,7 @@ class AnonResult: # ----------------- Config loader ----------------- def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: - cfg = DEFAULTS_CFG.copy() - if config_path and config_path.exists() and yaml is not None: - try: - user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} - # shallow-merge for top-level keys - for k, v in user.items(): - cfg[k] = v - except Exception: - pass - return cfg + return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path) # ----------------- Extraction ----------------- @@ -416,7 +382,7 @@ if __name__ == "__main__": ap.add_argument("--out", type=str, 
default="out") ap.add_argument("--no-vector", action="store_true") ap.add_argument("--raster", action="store_true") - ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) + ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH)) args = ap.parse_args() outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config)) print(json.dumps(outs, indent=2, ensure_ascii=False)) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 5346d58..1629b59 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -44,6 +44,12 @@ try: except Exception: yaml = None +from config_defaults import ( + RUNTIME_DICTIONARIES_CONFIG_PATH, + load_effective_dictionaries_dict, + load_default_dictionaries_dict, +) + try: from doctr.models import ocr_predictor as _doctr_ocr_predictor _DOCTR_AVAILABLE = True @@ -115,6 +121,29 @@ def _load_bdpm_medication_names() -> set: return set() +def _load_wordlist_file( + path: Path, + *, + transform=lambda s: s, + label: str, + min_len: int = 1, +) -> set: + """Charge un fichier texte, un mot par ligne.""" + result: set = set() + if not path.exists(): + log.warning("%s introuvable : %s", label, path) + return result + try: + for line in path.read_text(encoding="utf-8").splitlines(): + word = line.strip() + if word and not word.startswith("#") and len(word) >= min_len: + result.add(transform(word)) + log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name) + except Exception as exc: + log.error("%s : erreur de lecture %s — %s", label, path, exc) + return result + + # ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) ----------------- # Prénoms et noms de famille sont utilisés sous deux formes : # - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS" @@ -199,62 +228,24 @@ 
_FINESS_ADDR_AC = None # Automate Aho-Corasick pour adresses (noms d _VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS) # Communes trop ambiguës (homonymes de mots courants, trop courts, etc.) -_VILLE_BLACKLIST = { - # Directions / mots géographiques génériques - "SAINT", "NORD", "SUD", "EST", "OUEST", - "CENTRE", "SERVICE", "BOURG", - # Communes homonymes de mots courants français - "ORANGE", "TOURS", "NICE", "SENS", "VITRE", - "ROMANS", "MENTON", "SALON", "VIENNE", - "BREST", # trop court et ambigu - "HYERES", # proche de termes médicaux - "AGEN", "AUCH", "ALBI", - "BLOIS", "LAON", "LENS", - "GIEN", "GRAY", - "AIRE", "LURE", "SETE", "DOLE", - "VIRE", "LUNEL", "MURET", "MORET", - "COEUR", "FOIX", "GIVET", - "EVIAN", "MAURE", "MENDE", - "JOUE", "MEAUX", "REDON", - "CREIL", "CERGY", - # Communes de 4-5 lettres homonymes de mots très courants - "VERS", "MONT", "MARS", "PORT", "PONT", "FORT", - "BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY", - "VAUX", "VERT", "FAUX", "REZE", - "BILLE", "PLACE", "VILLE", "COURS", "GRAND", - "ROUGE", "RICHE", "NUITS", "SORE", "SARE", - "TRANS", "RANS", "MARSA", - # Mots courants français (6+ lettres) aussi communes - "CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES", - "MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE", - "SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS", - "PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON", - # Prénoms très courants (aussi communes) - "MARIE", "PIERRE", "JEAN", "PAUL", "ANNE", - # Expressions composées ambiguës (aussi communes INSEE) - "LONG", "RECY", "PLAN", "MARCHE", "SALLE", - "CONTRE", "MERE", "ONDRES", "VEBRE", - # Mots structurels / médicaux - "PARIS", # omniprésent, source de faux positifs - "FRANCE", "EUROPE", - # Termes ambigus (aussi communes INSEE) - trackare/DPI - "COURANT", # "Médecin courant" ≠ ville - # Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)") - "COU", "DOS", "SEIN", "BRAS", +_VILLE_BLACKLIST_FALLBACK = { + "PARIS", + "FRANCE", + "EUROPE", 
+ "COURANT", + "COU", + "DOS", + "SEIN", + "BRAS", } -# Enrichissement depuis fichier externe (modifiable sans toucher au code) -_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt" -if _villes_bl_file.exists(): - try: - for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _VILLE_BLACKLIST.add(_w) - log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST)) - except Exception as _exc: - log.error("Villes blacklist : erreur de lecture %s — %s", _villes_bl_file, _exc) -else: - log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file) +_VILLE_BLACKLIST = _load_wordlist_file( + Path(__file__).parent / "data" / "villes_blacklist.txt", + transform=str.upper, + label="Villes blacklist", +) +if not _VILLE_BLACKLIST: + _VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK) +_BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST) try: import ahocorasick as _ahocorasick @@ -331,7 +322,7 @@ def load_medical_whitelists(): global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST # 1. Charger les termes médicaux structurels - config_path = Path("config/medical_terms_whitelist.yml") + config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml" if config_path.exists() and yaml: try: with open(config_path, 'r', encoding='utf-8') as f: @@ -345,48 +336,20 @@ def load_medical_whitelists(): # 2. 
Charger la whitelist des médicaments (edsnlp + BDPM + manuels) _MEDICATION_WHITELIST = _load_edsnlp_drug_names() _MEDICATION_WHITELIST.update(_load_bdpm_medication_names()) - # Ajouter médicaments manquants - additional_meds = { - "idacio", "salazopyrine", "infliximab", "apranax", - "ketoprofene", "prevenar", "pneumovax", "bétadine" - } - _MEDICATION_WHITELIST.update(additional_meds) + _MEDICATION_WHITELIST.update( + _load_wordlist_file( + Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt", + transform=str.lower, + label="Whitelist médicaments manuelle", + min_len=3, + ) + ) log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)") # Charger les whitelists au démarrage du module load_medical_whitelists() -# ----------------- Defaults & Config ----------------- -DEFAULTS_CFG = { - "version": 1, - "encoding": "utf-8", - "normalization": "NFKC", - "whitelist": { - "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], - "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], - "org_gpe_keep": False, - }, - "blacklist": { - "force_mask_terms": [], - "force_mask_regex": [], - }, - "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], - "regex_overrides": [ - { - "name": "OGC_court", - "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", - "placeholder": "[OGC]", - "flags": ["IGNORECASE"], - } - ], - "flags": { - "case_insensitive": True, - "unicode_word_boundaries": True, - "regex_engine": "python", - }, -} - PLACEHOLDERS = { "EMAIL": "[EMAIL]", "TEL": "[TEL]", @@ -445,408 +408,49 @@ def validate_nir(nir_raw: str) -> bool: return False return key_int == (97 - (body_int % 97)) -# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes -_MEDICAL_STOP_WORDS_SET = { - # Mots français courants (déterminants, prépositions, adverbes, etc.) 
- "pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous", - "mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par", - "les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces", - "cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant", - "puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques", - "mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours", - "semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau", - "franche", "légère", "quelque", "depuis", "comme", "encore", "votre", - "date", "note", "notes", "nom", "heure", "matin", "soir", "midi", - "signé", "réalisé", "courrier", "cabinet", "rue", - # Verbes / participes courants - "remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée", - "prescrit", "prescrite", "présente", "présent", "absente", "absent", - "reprise", "introduction", "arrêt", "relais", - # Titres / rôles hospitaliers - "chef", "assistant", "assistante", "praticien", "praticienne", - "docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers", - "spécialiste", "contractuel", "contractuelle", "titulaire", - "confrère", "consoeur", "coordonnateur", "coordonnatrice", - "médecin", "médical", "infirmier", "infirmière", - "praticiens", "patient", "patiente", - # Structure hospitalière - "service", "pôle", "clinique", "consultation", "secrétariat", - "hôpital", "hôpitaux", "centre", "établissement", "polyclinique", - # Villes / géographie (pas des noms de personnes) - "bordeaux", "bayonne", "paris", "lyon", "lille", "marseille", - "toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons", - "basque", "basques", "sud", "côte", - # Médicaments génériques et spécialités (DCI + noms commerciaux) - "colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto", - "methotrexate", "eplerenone", "speciafoldine", "prednisone", - "corticoïdes", "cortisone", - "paracetamol", "metformine", "solupred", 
"novorapid", "abasaglar", - "lovenox", "methylprednisolone", "potassium", "humalog", "furosemide", - "insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine", - "ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen", - "morphine", "oxycodone", "kardegic", "tercian", "zopiclone", - "seresta", "tramadol", "alprazolam", "forlax", "levothyrox", - "bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva", - "quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline", - "opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose", - "laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl", - "terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro", - "lansoprazole", "perindopril", "sodium", "velmetia", - "doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene", - "augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole", - "ramipril", "lisinopril", "enalapril", "losartan", "valsartan", - "irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide", - "spironolactone", "furosemide", "lasilix", "aldactone", - "tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine", - "xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran", - "plavix", "clopidogrel", "ticagrelor", "brilique", - "ventoline", "seretide", "symbicort", "salmeterol", "fluticasone", - "salbutamol", "tiotropium", "budesonide", "beclometasone", - "oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl", - "nubain", "nalbuphine", "nefopam", "acupan", "profenid", - "ibuprofene", "diclofenac", "naproxene", "celecoxib", - "gabapentine", "pregabaline", "lyrica", "neurontin", - "amitriptyline", "duloxetine", "venlafaxine", "fluoxetine", - "paroxetine", "escitalopram", "citalopram", "mirtazapine", - "olanzapine", "risperidone", "aripiprazole", "haloperidol", - "loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam", - "clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine", - "stilnox", "zolpidem", "imovane", - 
"levothyroxine", "metformine", "glimepiride", "sitagliptine", - "januvia", "jardiance", "empagliflozine", "dapagliflozine", - "ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza", - "heparine", "enoxaparine", "tinzaparine", "innohep", - "warfarine", "coumadine", "fluindione", "previscan", - "ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole", - "vancomycine", "gentamicine", "tazocilline", "piperacilline", - "meropenem", "imipenem", "clindamycine", "doxycycline", - "azithromycine", "clarithromycine", "cotrimoxazole", "bactrim", - "polyionique", "propranolol", "apidra", "solostar", - # Noms et suffixes laboratoires pharmaceutiques - "arw", "myl", "myp", "arg", "teva", "bga", "agt", - "mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers", - "accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed", - "evolugen", "alter", "zydus", "medisol", "substipharm", - "sdz", "bgr", "egt", "rnb", - # Formes galéniques / voies d'administration - "cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen", - "flestouch", "kwikpen", "inj", "susp", "gelule", "comprime", - "unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr", - "orodisp", "capsule", "patch", "suppositoire", "gouttes", - # Termes de prescription / pharmacie - "prescription", "prescriptions", "dose", "fréquence", "statut", - "technique", "capteur", "bandelettes", "glycemiques", "glycemique", - "lancettes", "aiguilles", "fines", "micro", "pompe", "réserve", - "glycemie", "capillaire", "hgt", - # Termes médicaux / cliniques - "myocardite", "myosite", "corticothérapie", "biopsie", "pathologie", - "dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic", - "antécédents", "examen", "bilan", "résultats", "analyse", - "interne", "externe", "médecine", "chirurgie", "rhumatologie", - "dermatologie", "immunologie", "cardiologie", "pneumologie", - "neurologie", "gynécologie", "radiologie", "sénologie", - "douleur", "douleurs", "douloureux", "musculaire", 
"musculaires", - "thoracique", "thoraciques", "membres", "supérieurs", "inférieurs", - "normale", "normaux", "habituelle", "habituelles", - "synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo", - "pelvien", "diabétique", "sommeil", "régime", "diet", - "desinfection", "environnement", "identification", "bracelet", - "toilettes", "accompagner", "installer", "transfusion", - "signes", "vitaux", "alimentaire", "avis", "zone", - "calcémie", - # Abréviations médicales - "irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj", - "bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c", - "saos", "tsh", "inr", "vgm", "pnn", "plq", "hb", - "poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo", - "qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren", - "vit", "zen", - "scanner", "radio", "écho", "échographie", - # Spécialités médicales (éviter faux positifs NOM) - "hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie", - "proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue", - "cardiologue", "néphrologue", "urologue", "gériatre", - "hépatologue", "endocrinologue", "stomatologue", - # Termes médicaux / titres fréquemment détectés comme NOM par le NER - "supplémentation", "supplementation", "endocrinologie", "monsieur", "madame", - "suivi", "sortie", "emog", "ophtalmo", - # Médicaments détectés comme NOM/PRENOM par EDS-Pseudo - "eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe", - "lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine", - "depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris", - "rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol", - "pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox", - # Termes médicaux / soins / actes détectés comme NOM - "partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique", - "diabétique", "transdermique", "transderm", "diarrhees", "diarrhées", - "ionogramme", 
"scintigraphie", "thoraco", "thorax", "négative", "negative", - "diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal", - # Mots courants français détectés comme NOM dans les trackare - "toilette", "repas", "poche", "installation", "education", "éducation", - "refection", "réfection", "complete", "complète", "regime", "régime", - "normal", "traité", "traite", "arrêté", "arrete", "volume", - "commentaires", "france", "covid", "framboise", "epoux", "époux", - # Abréviations médicales courtes (3-4 chars) détectées comme NOM - "ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp", - "mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd", - "amox", "endoc", "microg", "item", "pyélo", "néphro", - # En-têtes de colonnes / mots structurels trackare - "observations", "observation", "commentaires", "commentaire", - "surveillance", "température", "temperature", "glycémie", "glycemie", - "diurèse", "diurese", "balance", "pouls", "systolique", "diastolique", - "saturation", "fréquence", "frequence", "respiratoire", "douleur", - "alertes", "alerte", "antécédents", "antecedents", "habitus", - "allergies", "prescriptions", "prescription", "administration", - "catégorie", "categorie", "expiration", "message", - "destination", "diagnostique", "diagnostiques", - "date", "note", "nom", "heure", "type", "code", "etat", - "comprime", "comprimé", "gelule", "gélule", "solution", "injectable", - # Médicaments supplémentaires détectés dans les trackare - "depakote", "versatis", "humalog", "forxiga", "durogesic", - "montelukast", "rosuvastatine", - # Abréviations pharma courtes - "cpr", "sol", "bic", "agt", "poche", "inhal", - # Termes chirurgicaux/cliniques FP - "cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée", - "gauche", "droit", "droite", "face", "profil", - # Faux positifs EDS supplémentaires - "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta", - "axa", "ttt", "anionique", 
"abdomino", "cod", "omi", "urg", "med", - "10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel", - "actiskenan", "simvastatine", "forlax", - # Mots temporels / contextuels détectés comme EDS_HOPITAL - "semaine", "jour", "matin", "soir", "nuit", "midi", - # Mots clés de contexte document - "compétences", "maladies", "inflammatoires", "systémiques", "rares", - "fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats", - "haute", "maison", "aide", "rpps", "poste", "fonct", - "sante", "santé", "etxe", "ttipi", "gastro", "concha", - "endoscopie", "endoscopique", "fibroscopie", - "indication", "conclusion", "technique", "anesthésie", - "digestif", "digestive", "digestives", "nutritive", - # Abréviations soins trackare détectées comme NOM (batch 20 OGC) - "soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp", - # Verbes d'instructions soins (aussi des patronymes INSEE → FP) - "coucher", "manger", "marcher", "sortir", - "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs", - # Mots narratifs CRH capturés par fusion sidebar 2-colonnes - "evolution", "évolution", "explorations", "fermeture", "allergie", "allergies", - "lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie", - "paracetamol", "paracétamol", "unité", "unite", - # FP résiduels batch 10 OGC (termes médicaux/instructions soins) - "glyc", "glycosurie", "vider", "forte", - # FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM) - "oncologie", "confrères", "confrere", "doubles", "chers", "motif", - "responsable", "autre", "autres", "autonome", "autonomes", - "préparations", "preparations", "prévenir", "prevenir", - "acétylsalicylique", "acetylsalicylique", "angio", - "desc", "diu", "barreau", - "haitz", "alde", - # FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL - "alimentation", "augmentation", "amelioration", "amélioration", - "biliaire", "biliaires", "bili", "voies", "voie", - "apyrexie", "apyréxie", 
"apyrétique", "apyretique", - "clavulanique", "mecillinam", "sulfamides", "sulfamide", - "tazobactam", "temocilline", "ecoflac", "furanes", "furane", - "exilar", "lipruzet", "mopral", - "sensible", "sensibles", "dossier", "dossiers", - "entero", "entéro", "medecine", "bio", - "aviation", "contention", "isolement", - "elimination", "élimination", "infectieux", - "hémodynamique", "hemodynamique", "pancréatite", "pancreatite", - "cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie", - "appendicectomie", "néoplasie", "neoplasie", - "ovarienne", "prandial", "fébrile", "febrile", - "eupnéique", "eupneique", "normocarde", "normotendue", - "variable", "dosage", "posologie", - # Abréviations diététiques/soins trackare - "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass", - # FP audit OGC 17 CRH - "mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel", - "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet", - "saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo", - # Spécialités/services récurrents comme FP NOM - "cancérologie", "cancerologie", "réanimation", "reanimation", - "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie", - "gériatrie", "geriatrie", "pédiatrie", "pediatrie", - "ophtalmologie", "stomatologie", "allergologie", - "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie", - "orthopédie", "orthopedie", "traumatologie", - "palliatifs", "palliative", "palliatif", - "addictologie", "alcoologie", "tabacologie", - # FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon) - "discussion", "echelle", "échelle", "scope", "tdm", "bouteille", - "evendol", "relais", "repas", "poursuite", "indication", - # FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM") - "eliminatin", "elimination", "élimination", "preremplie", "pré-remplie", - "thermie", "alim", "alimentation", "admin", - # 
Médicaments/tests labo capturés par patterns soignants - "biprofenid", "bi-profenid", "phosphatase", "phosphatases", - "ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol", - "ciprofloxacine", "lavement", "desinfection", "désinfection", - "avaler", "rachis", "lombaire", "thoraco-lombaire", - "cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique", - "thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire", - # Dosages et labos pharma (FP fréquents dans prescriptions Trackare) - "faible", "fort", "forte", - "myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg", - "arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris", - "abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal", - "entree", "entrée", "continu", "continue", - "morphine", "claforan", "skenan", "actiskenan", - # Fragments de noms de médicaments (pdfplumber split) - "sium", "pegic", "fenid", "profenid", - # Catégories cliniques Trackare (en-têtes de section masqués à tort) - "respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo", - "hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse", - "transit", "anemie", "anémie", "constantes", "examen", - "post-op", "postop", "pré-op", "preop", "chimio", "elim", - "toilette", "sommeil", "hypota", "hypotension", "spo2", - "urine", "urines", "sng", - "rénale", "renale", "rénal", "renal", "cardiaque", - # Termes structurels trackare - "transmissions", "transmission", "releve", "relevé", - "objectif", "objectifs", "evaluation", "évaluation", - "planification", "planifié", "planifiee", - # ── FP détectés automatiquement par audit_fp_detector.py ── - # Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms - "acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin", - "bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert", - "devenir", "diffusé", "douche", "entrée", "escarre", "espace", - "explications", "fauteuil", 
"feuillet", "fixateur", "fois", "gamma", - "germes", "glace", "habillage", "liste", "maquillage", "matelas", - "mettre", "obésité", "ongles", "palais", "perlant", "pertes", - "pièce", "plaie", "risque", "saint", "sang", "signe", "sonde", - "tenue", "texte", "transaminases", "transit", "transmis", "urinal", - "vernis", "vessie", "vrac", - # Lot 2 : termes médicaux (préfixes/suffixes) - "anatomo-pathologique", "anemie", "anémie", "angioscanner", - "cétonurie", "cetonurie", "depilation", "dépilation", - "folique", "gastroentérologue", "gastroenterologue", - "microgrammes", "nalidixique", "naso-gastrique", - "angio-irm", "neuro", "neuro-chirurgie", "endoplasmique", - "cyto", "plaie-colle", "bionolyte", - # Lot 1 (103 tokens, confiance >= 0.5) ── - # Anatomie / clinique - "abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique", - "intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne", - "plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire", - # Pathologies / symptômes - "algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie", - "hemodialyse", "hemorragique", "hyperthermie", "hématologue", - # Médicaments / matériel médical - "ampoule", "antalgique", "antiseptique", "compresse", "flacon", - "oxygène", "pansement", "vitamine", - # Biologie / examens - "biochimie", "biologie", "fer", - # Actions / états cliniques - "ablation", "absence", "admission", "bloc", "changement", "cliniquement", - "cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire", - "intervention", "position", "rappel", "relation", "retour", "réalisation", - "résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences", - "urgent", "validation", - # Mots courants / contextuels - "angle", "bille", "boisson", "bureau", "cases", "circuit", - "concubin", "confortable", "demain", "densité", "dernière", - "distant", "domaine", "elle", "fils", "frère", "grand", "horizon", - "hui", "identifiant", 
"minuit", "murent", "neuf", "original", "pages", - "personne", "premier", "quartier", "retraite", "route", "rés", - "trouve", "verrouillé", "villa", "étage", - # Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026) - "ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp", - "bronchite", "accueil", "cadre", "transfert", "relecture", "examens", - "traitements", "traitement", "infectiologie", "cancérologie", "cancerologie", - "maternité", "orale", "sachet", "absence", - # FP audit 30 fichiers Phase 2 (mars 2026) - "bouffee", "bouffée", "discontinue", "respimat", "lyoc", - "probnp", "pro-bnp", "nt-probnp", - "bpco", "colle", "gsc", "masse", - "selle", "selles", - # Acronymes médicaux courts (3 lettres) souvent FP comme NOM - "epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc", - "imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr", - "hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm", - "vni", "aeg", "bas", "snv", "hba", "ide", "dci", - # Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026) - "buvable", "buvables", "nominal", "nominaux", - "acide", "principale", "principal", "principaux", - "hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique", - "clavulanique", "nalidixique", - "grancher", # Centre de réadaptation (nom d'établissement homonyme) - "experf", # Prestataire HAD (nom commercial homonyme) - # Noms de services hospitaliers (FP comme [NOM]) - "ortho", "mobile", "polyvalente", "polyvalent", - "geriatrie", "gériatrie", "ambulatoire", "provisoire", - "intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané", - # Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents) - "viscerale", "viscérale", "vasculaire", "vasculaires", - "conventionnelle", "conventionnel", - "polyvalente", "polyvalent", - "infectieuse", "infectieuses", - # Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216) - "aide", "partielle", "partiel", "complete", 
"complète", "complet", - "contention", "lavabo", "blader", "scan", "post", "lunettes", - "deshabillage", "déshabillage", "habillage", - "surveillance", "surv", "refection", "réfection", - "miction", "toilette", "douche", "changes", - "installation", "transfert", "mobilisation", - "alimentation", "hydratation", "collation", - "stimulation", "prevention", "prévention", - # Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16) - "chlorure", - # Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM]) - "canule", "canules", "masque", "sonde", "sondes", - # Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17) - "totale", "total", "partielle", "partiel", - "prothese", "prothèse", "protheses", "prothèses", "unicompartimentale", - # Antiseptiques / produits de soins (FP trackare prescriptions) - "betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine", - # Nutrition entérale / compléments - "fresubin", "nutrison", "sondalis", "isosource", "novasource", - # Termes médicaux FP dans bactério / texte libre - "nombreuses", "nombreux", "plusieurs", "quelques", - "internationale", "international", - "resorbable", "résorbable", "resorbables", "résorbables", - "alfa", "capsule", "capsules", +# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes. +# Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp. 
+_MEDICAL_STOP_WORDS_FALLBACK = { + "date", + "note", + "heure", + "type", + "traitement", + "traitements", + "soins", + "surveillance", + "consultation", + "hospitalisation", } -# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp -_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names()) - -# Enrichissement depuis fichier externe (modifiable sans toucher au code) -_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt" -if _stopwords_file.exists(): - try: - _sw_count = 0 - for _line in _stopwords_file.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _MEDICAL_STOP_WORDS_SET.add(_w) - _sw_count += 1 - log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name) - except Exception as _exc: - log.error("Stop-words manuels : erreur de lecture %s — %s", _stopwords_file, _exc) -else: - log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file) - -# Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives -_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt" -if _bdpm_path.exists(): - try: - _bdpm_count = 0 - for _line in _bdpm_path.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _MEDICAL_STOP_WORDS_SET.add(_w) - _bdpm_count += 1 - log.info("BDPM stop-words chargés : %d mots", _bdpm_count) - except Exception as _exc: - log.error("BDPM stop-words : erreur de lecture %s — %s", _bdpm_path, _exc) -else: - log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path) - -_MEDICAL_STOP_WORDS = ( - r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")" +_MEDICAL_STOP_WORDS_SET = _load_wordlist_file( + Path(__file__).parent / "data" / "stopwords_manuels.txt", + transform=str.lower, + label="Stop-words manuels", ) +_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names()) 
+_MEDICAL_STOP_WORDS_SET.update( + _load_wordlist_file( + Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt", + transform=str.lower, + label="BDPM stop-words", + ) +) +if not _MEDICAL_STOP_WORDS_SET: + _MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK) +_BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET) + + +def _refresh_medical_stopwords_pattern() -> None: + global _MEDICAL_STOP_WORDS + if not _MEDICAL_STOP_WORDS_SET: + _MEDICAL_STOP_WORDS = r"(?!)" + return + _MEDICAL_STOP_WORDS = ( + r"(?:" + "|".join(re.escape(w) for w in sorted(_MEDICAL_STOP_WORDS_SET)) + r")" + ) + + +_refresh_medical_stopwords_pattern() # Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point) _PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" RE_PERSON_CONTEXT = re.compile( @@ -985,7 +589,17 @@ RE_CIVILITE_INITIALE = re.compile( # --- N° examen / N° patient imagerie (radiologie) --- RE_NUM_EXAMEN_PATIENT = re.compile( - r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})", + r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*" + r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)", + re.IGNORECASE, +) +# --- N° bare d'entête labo / imagerie --- +# Exemple: +# N° 23L35781 +# Prélevé le 26/07/2023 Enregistré le 27/07/2023 +RE_NUM_ACCESSION_HEADER = re.compile( + r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n" + r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)", re.IGNORECASE, ) @@ -1177,6 +791,7 @@ _DPI_LABELS_SET: set = _load_txt_set( ) if not _DPI_LABELS_SET: _DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK) +_BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET) # Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms # (spécialités, labos pharma, mots courants ambigus). 
@@ -1189,6 +804,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set( ) if not _COMPANION_BLACKLIST_SET: _COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK) +_BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET) _WHITELIST_FUNCTION_WORDS = { @@ -1223,14 +839,15 @@ def _load_whitelist_phrases(phrases) -> int: def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: - cfg = DEFAULTS_CFG.copy() - if config_path and config_path.exists() and yaml is not None: - try: - user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} - for k, v in user.items(): - cfg[k] = v - except Exception: - pass + global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET + cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path) + + _MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET) + _VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST) + _DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET) + _COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET) + _WHITELIST_NEVER_MASK_TOKENS.clear() + _WHITELIST_NEVER_MASK_PHRASES.clear() # Charger les stop-words et villes supplémentaires depuis le YAML extra_sw = cfg.get("additional_stopwords", []) @@ -1239,6 +856,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: if w and str(w).strip(): _MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower()) log.info("Stop-words YAML supplémentaires : %d", len(extra_sw)) + _refresh_medical_stopwords_pattern() extra_villes = cfg.get("additional_villes_blacklist", []) if extra_villes: @@ -1871,8 +1489,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str: return key +def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str: + start = full_match.find(captured_value) + if start < 0: + return placeholder + end = start + len(captured_value) + return full_match[:start] + placeholder + full_match[end:] + + +def 
_mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str: + """Masque les champs structurés dont la détection dépend du libellé de la ligne.""" + + def _repl_code_postal(m: re.Match) -> str: + original = m.group(1) or m.group(2) or m.group(0) + audit.append(PiiHit(page_idx, "CODE_POSTAL", original, PLACEHOLDERS["CODE_POSTAL"])) + if m.group(1): + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"]) + return PLACEHOLDERS["CODE_POSTAL"] + + def _repl_num_examen(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["DOSSIER"]) + + def _repl_dossier(m: re.Match) -> str: + original = m.group(1) or m.group(2) or m.group(0) + audit.append(PiiHit(page_idx, "DOSSIER", original, PLACEHOLDERS["DOSSIER"])) + return _replace_captured_value(m.group(0), original, PLACEHOLDERS["DOSSIER"]) + + def _repl_venue(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "NDA", m.group(1), PLACEHOLDERS["NDA"])) + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]) + + masked = RE_CODE_POSTAL.sub(_repl_code_postal, line) + masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked) + masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked) + masked = RE_VENUE_SEJOUR.sub(_repl_venue, masked) + return masked + + def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: line = _mask_admin_label(line, audit, page_idx) + structured_line = _mask_structured_line(line, audit, page_idx) + if structured_line != line: + return structured_line parts = SPLITTER.split(line, maxsplit=1) if len(parts) == 2: key, value = parts @@ -2413,6 +2072,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s for m in _RE_EMAIL_HEADER.finditer(full_text): _add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium") + # En-têtes patient en capitales, sans libellé 
explicite. + # Exemple: + # ETCHEVERRY JEAN CLAUDE + # On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom + # INSEE et un nom de famille INSEE. Les tokens proposés viennent + # exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici. + _UPPER_NAME_LINE_RE = re.compile( + r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+" + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$", + re.MULTILINE, + ) + for m in _UPPER_NAME_LINE_RE.finditer(full_text): + raw_line = re.sub(r"\s+", " ", m.group(1)).strip() + tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")] + if len(tokens) < 2 or len(tokens) > 4: + continue + if any(len(tok) < 3 for tok in tokens): + continue + + norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens] + has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens) + has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens) + if not (has_prenom and has_nom): + continue + + for tok, norm_tok in zip(tokens, norm_tokens): + if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE: + _add_candidate(tok, "UPPER_NAME_LINE", "low", False) + # Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"), # ajouter aussi les parties individuelles pour capturer les occurrences standalone. # _apply_extracted_names traite le composé en premier (plus long) puis les parties. @@ -2582,10 +2270,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str: - """Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.). + """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.). 
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt.""" - _APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"} + _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"} # Collecter les valeurs à remplacer, groupées par placeholder replacements: Dict[str, str] = {} # original → placeholder for h in audit: @@ -2698,7 +2386,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_IPP_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"])) - # Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164) + # Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie + # Ex: + # N° 23L35781 + # Prélevé le 26/07/2023 + for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw): + audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + + # Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164) _RE_DEMANDE_MULTILINE = re.compile( r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})", re.IGNORECASE, @@ -2706,14 +2401,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_DEMANDE_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) - # Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés) + # Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés) _RE_VENUE_MULTILINE = re.compile( r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})", re.IGNORECASE, ) for m in _RE_VENUE_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"])) - # Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label) + # Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label) _RE_VENUE_REVERSE = re.compile( r"(? 
str: protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected) - protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected) + def _rescan_code_postal(m: re.Match) -> str: + if m.group(1): + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"]) + return PLACEHOLDERS["CODE_POSTAL"] + + protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected) # N° Episode protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected) # N° venue / séjour - protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected) + protected = RE_VENUE_SEJOUR.sub( + lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]), + protected, + ) # N° RPPS protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected) # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS) @@ -4825,7 +4493,7 @@ if __name__ == "__main__": ap.add_argument("--out", type=str, default="out") ap.add_argument("--no-vector", action="store_true") ap.add_argument("--raster", action="store_true") - ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) + ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH)) ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)") ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner") args = ap.parse_args() diff --git a/build_windows.bat b/build_windows.bat index 7c594e6..db57fb5 100644 --- a/build_windows.bat +++ b/build_windows.bat @@ -33,6 +33,7 @@ python -m nuitka ^ --include-module=ner_manager_onnx ^ --include-module=eds_pseudo_manager ^ --include-data-dir=config=config ^ + --include-data-dir=data=data ^ --include-data-dir=models=models ^ --nofollow-import-to=onnxruntime ^ --nofollow-import-to=numpy ^ diff --git 
a/config/dictionnaires.default.yml b/config/dictionnaires.default.yml new file mode 100644 index 0000000..d93fb45 --- /dev/null +++ b/config/dictionnaires.default.yml @@ -0,0 +1,59 @@ +# Template versionné des règles d'anonymisation. +# Ce fichier décrit les valeurs par défaut complètes de l'application. +# La surcharge locale chargée par défaut est config/dictionnaires.yml. +version: 1 +encoding: utf-8 +normalization: NFKC +whitelist: + sections_titres: + - DIM + - GHM + - GHS + - RUM + - COMPTE + - RENDU + - DIAGNOSTIC + noms_maj_excepts: + - Médecin DIM + - Praticien conseil + org_gpe_keep: false +blacklist: + # Sigles et libellés propres à l'établissement non couverts par les gazetteers + # nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux, + # villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement. + force_mask_terms: + - CHCB + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + force_mask_regex: + - '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+' +kv_labels_preserve: +- FINESS +- IPP +- N° OGC +- Etablissement +regex_overrides: +- name: OGC_court + pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b + placeholder: '[OGC]' + flags: + - IGNORECASE +whitelist_phrases: + - "classification internationale" + - "prise en charge" + - "bas de contention" + - "date de naissance" + - "lieu de naissance" + - "ville de résidence" + - "date de sortie" + - "date d'admission" + - "code postal" +additional_stopwords: [] +additional_villes_blacklist: [] +additional_dpi_labels: [] +additional_companion_blacklist: [] +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: python diff --git a/config/dictionnaires.yml b/config/dictionnaires.yml index fa36e53..b6efeee 100644 --- a/config/dictionnaires.yml +++ b/config/dictionnaires.yml @@ -1,83 +1,11 @@ -version: 1 -encoding: utf-8 -normalization: NFKC -whitelist: - sections_titres: - - DIM - - GHM - - GHS - - 
RUM - - COMPTE - - RENDU - - DIAGNOSTIC - noms_maj_excepts: - - Médecin DIM - - Praticien conseil - org_gpe_keep: false -blacklist: - # Sigles et libellés propres à l'établissement non couverts par les gazetteers - # nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux, - # villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement. - force_mask_terms: - - CHCB # Sigle local non référencé FINESS - - 'Dates du séjour :' # Libellé administratif (politique masquage) - - CONCERTATION # Mention de RCP (politique métier) - - LABORATOIRE de BIOLOGIE MEDICALE # Libellé administratif générique - force_mask_regex: - # Adresse précise du CHCB — couverte par l'AC FINESS adresses mais on garde - # la regex en filet de sécurité (encodages PDF, espaces non standards). - - '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+' -kv_labels_preserve: -- FINESS -- IPP -- N° OGC -- Etablissement -regex_overrides: -- name: OGC_court - pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b - placeholder: '[OGC]' - flags: - - IGNORECASE -# Phrases à ne JAMAIS anonymiser (faux positifs récurrents) -# Ajouter ici les expressions qui sont masquées à tort. -# La correspondance est insensible à la casse. 
-whitelist_phrases: - - "classification internationale" - - "prise en charge" - - "bas de contention" - - "date de naissance" - - "lieu de naissance" - - "ville de résidence" - - "date de sortie" - - "date d'admission" - - "code postal" -# Mots supplémentaires à ne jamais masquer comme noms de personnes -# (complète les 9000+ stop-words intégrés) -additional_stopwords: [] -# Exemple : -# - "votre_mot" - -# Villes supplémentaires à ne jamais matcher comme lieux -# (complète les 115+ villes blacklistées intégrées) -additional_villes_blacklist: [] -# Exemple : -# - "VOTRE_VILLE" - -# Labels DPI supplémentaires à ne jamais masquer comme noms -# (complète data/dpi_labels_blacklist.txt) -# Utiliser pour : titres de colonnes, en-têtes de sections, libellés de champs -additional_dpi_labels: [] -# Exemple : -# - "Service" -# - "Statut" - -# Termes en MAJUSCULES à ne jamais propager comme noms compagnons -# (complète data/companion_blacklist.txt — spécialités, labos pharma, mots ambigus) -additional_companion_blacklist: [] -# Exemple : -# - "VOTRE_SPECIALITE" - -flags: - case_insensitive: true - unicode_word_boundaries: true - regex_engine: python +# Surcharge locale chargée par défaut par l'application. +# Source de vérité des valeurs par défaut : config/dictionnaires.default.yml +# Ce fichier ne doit contenir que les écarts spécifiques à l'environnement courant. +# +# Exemples : +# blacklist: +# force_mask_terms: +# - VOTRE_SIGLE +# additional_stopwords: +# - votre_terme +{} diff --git a/config_defaults.py b/config_defaults.py new file mode 100644 index 0000000..2762c95 --- /dev/null +++ b/config_defaults.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Helpers partagés pour la config dictionnaires. 
+""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +try: + import yaml +except Exception: + yaml = None + + +PROJECT_DIR = Path(__file__).resolve().parent +CONFIG_DIR = PROJECT_DIR / "config" +DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.default.yml" +RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.yml" + +_RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application. +# Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici. +# Si ce fichier est vide, les valeurs du template par défaut s'appliquent. +# +# Exemples : +# blacklist: +# force_mask_terms: +# - VOTRE_SIGLE +# additional_stopwords: +# - votre_terme +{} +""" + +_FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1 +encoding: utf-8 +normalization: NFKC +whitelist: + sections_titres: + - DIM + - GHM + - GHS + - RUM + - COMPTE + - RENDU + - DIAGNOSTIC + noms_maj_excepts: + - Médecin DIM + - Praticien conseil + org_gpe_keep: false +blacklist: + force_mask_terms: [] + force_mask_regex: [] +kv_labels_preserve: +- FINESS +- IPP +- N° OGC +- Etablissement +regex_overrides: +- name: OGC_court + pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b + placeholder: '[OGC]' + flags: + - IGNORECASE +whitelist_phrases: [] +additional_stopwords: [] +additional_villes_blacklist: [] +additional_dpi_labels: [] +additional_companion_blacklist: [] +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: python +""" + +_FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = { + "version": 1, + "encoding": "utf-8", + "normalization": "NFKC", + "whitelist": { + "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], + "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], + "org_gpe_keep": False, + }, + "blacklist": { + "force_mask_terms": [], + "force_mask_regex": [], + }, + "kv_labels_preserve": 
["FINESS", "IPP", "N° OGC", "Etablissement"], + "regex_overrides": [ + { + "name": "OGC_court", + "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", + "placeholder": "[OGC]", + "flags": ["IGNORECASE"], + } + ], + "whitelist_phrases": [], + "additional_stopwords": [], + "additional_villes_blacklist": [], + "additional_dpi_labels": [], + "additional_companion_blacklist": [], + "flags": { + "case_insensitive": True, + "unicode_word_boundaries": True, + "regex_engine": "python", + }, +} + + +def read_default_dictionaries_text() -> str: + try: + return DEFAULT_DICTIONARIES_CONFIG_PATH.read_text(encoding="utf-8") + except Exception: + return _FALLBACK_DEFAULT_DICTIONARIES_TEXT + + +def read_runtime_dictionaries_overlay_text() -> str: + return _RUNTIME_DICTIONARIES_OVERLAY_TEXT + + +def load_default_dictionaries_dict() -> Dict[str, Any]: + text = read_default_dictionaries_text() + if yaml is not None: + try: + loaded = yaml.safe_load(text) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT) + + +def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]: + target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH + if not target.exists(): + return {} + if yaml is None: + return {} + try: + loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return {} + + +def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]: + return deep_merge_dict( + load_default_dictionaries_dict(), + load_runtime_dictionaries_overlay_dict(path), + ) + + +def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + merged = deepcopy(base) + for key, value in (override or {}).items(): + if isinstance(value, dict) and isinstance(merged.get(key), dict): + merged[key] = deep_merge_dict(merged[key], value) + elif 
isinstance(value, list) and isinstance(merged.get(key), list): + combined = list(merged[key]) + for item in value: + if item not in combined: + combined.append(deepcopy(item)) + merged[key] = combined + else: + merged[key] = deepcopy(value) + return merged + + +def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path: + target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH + if not target.exists(): + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8") + return target diff --git a/data/bdpm/medication_whitelist_manual.txt b/data/bdpm/medication_whitelist_manual.txt new file mode 100644 index 0000000..b114bc2 --- /dev/null +++ b/data/bdpm/medication_whitelist_manual.txt @@ -0,0 +1,11 @@ +# Compléments manuels à la whitelist médicaments. +# Un terme par ligne, en lowercase. + +idacio +salazopyrine +infliximab +apranax +ketoprofene +prevenar +pneumovax +bétadine diff --git a/data/finess/address_blacklist.txt b/data/finess/address_blacklist.txt new file mode 100644 index 0000000..fe75393 --- /dev/null +++ b/data/finess/address_blacklist.txt @@ -0,0 +1,7 @@ +# Faux positifs à exclure du gazetteer d'adresses FINESS. + +cabinet medical +cabinet dentaire +cabinet infirmier +cabinet paramedical +cabinet sage-femme diff --git a/data/finess/generic_name_blacklist.txt b/data/finess/generic_name_blacklist.txt new file mode 100644 index 0000000..6d1567e --- /dev/null +++ b/data/finess/generic_name_blacklist.txt @@ -0,0 +1,112 @@ +# Noms d'établissements trop génériques à ignorer dans l'automate FINESS. 
+ +clinique +pharmacie +hopital +centre +foyer +residence +maison +cabinet +service +laboratoire +institut +association +fondation +mutuelle +polyclinique +dispensaire +hospice +annexe +antenne +site +collegiale +collegial +cathedral +cathedrale +providence +esperance +renaissance +liberation +republique +fraternite +solidarite +independance +beauregard +bellevue +belvedere +promenade +esplanade +corniche +prefecture +croissant +confluence +bienvenue +chartreuse +commanderie +chapelle +basilique +departement +departementale +communautaire +chirurgicale +radiologie +addictologie +prevention +psychotherapique +ambulatoire +hospitalisation +consultation +surveillance +therapeutique +readaptation +reeducation +reanimation +specialisee +conventionnelle +professionnelle +informatique +administrative +regionale +generation +revolution +assomption +visitation +consolation +atlantique +manutention +prefiguration +intervalle +pharmaciens +pharmacien +transfert +comprimee +comprimees +injectable +injectables +maintenant +actuellement +auparavant +prochainement +rapidement +correctement +directement +simplement +internationale +international +intercommunal +intercommunale +resistance +radiotherapie +chimiotherapie +curietherapie +hormonotherapie +immunotherapie +kinesitherapie +ergotherapie +orthophonie +psychomotricite +convalescence +dependance +autonomie +gerontologie diff --git a/data/finess/generic_phrase_blacklist.txt b/data/finess/generic_phrase_blacklist.txt new file mode 100644 index 0000000..42e25a6 --- /dev/null +++ b/data/finess/generic_phrase_blacklist.txt @@ -0,0 +1,26 @@ +# Expressions FINESS multi-mots trop génériques à ignorer. 
+ +a domicile +au domicile +menage a domicile +du nord +du sud +de l est +de l ouest +la maison +la residence +les jardins +le village +le parc +la colline +au soleil +en france +long cours +au long cours +le bourg +le val +le clos +le mas +les pins +les chenes +les oliviers diff --git a/pseudonymisation_pipeline_gui_v3.py b/pseudonymisation_pipeline_gui_v3.py index 68fc7ec..cc1f44d 100644 --- a/pseudonymisation_pipeline_gui_v3.py +++ b/pseudonymisation_pipeline_gui_v3.py @@ -37,33 +37,18 @@ try: except Exception: yaml = None -APP_TITLE = "Pseudonymisation de PDF" -DEFAULT_CFG = Path("config/dictionnaires.yml") +from config_defaults import ( + RUNTIME_DICTIONARIES_CONFIG_PATH, + read_default_dictionaries_text, + read_runtime_dictionaries_overlay_text, +) -# YAML par défaut (patterns en bloc littéral pour éviter les échappements) -DEFAULTS_CFG_TEXT = """# dictionnaires.yml – valeurs par défaut -version: 1 -encoding: "utf-8" -normalization: "NFKC" -whitelist: - sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] - noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] - org_gpe_keep: true -blacklist: - force_mask_terms: [] - force_mask_regex: [] -kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] -regex_overrides: - - name: OGC_court - pattern: |- - \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b - placeholder: '[OGC]' - flags: [IGNORECASE] -flags: - case_insensitive: true - unicode_word_boundaries: true - regex_engine: "python" -""" +APP_TITLE = "Pseudonymisation de PDF" +DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH + +# YAML par défaut externalisé dans config/dictionnaires.default.yml +DEFAULTS_CFG_TEXT = read_default_dictionaries_text() +RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text() # ---------- util : ToolTip & helpers ---------- class ToolTip: @@ -211,7 +196,7 @@ class App: p = Path(self.cfg_path.get()) p.parent.mkdir(parents=True, exist_ok=True) if not p.exists(): - p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + 
p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8") def _cfg_browse(self): d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) @@ -248,7 +233,7 @@ class App: return try: with open(self.cfg_path.get(), "w", encoding="utf-8") as f: - yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False) + yaml.safe_dump(self.cfg_data or {}, f, allow_unicode=True, sort_keys=False) self._log("Règles sauvegardées.") except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}") @@ -258,8 +243,8 @@ class App: def _restore_defaults(self): try: - Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") - self._log("Règles restaurées aux valeurs par défaut.") + Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8") + self._log("Surcharge locale réinitialisée.") self._load_cfg() except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") diff --git a/run_batch_30_audit.py b/run_batch_30_audit.py index 4764cfb..7c5af8e 100644 --- a/run_batch_30_audit.py +++ b/run_batch_30_audit.py @@ -9,6 +9,7 @@ from collections import Counter sys.path.insert(0, str(Path(__file__).parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from eds_pseudo_manager import EdsPseudoManager from vlm_manager import VlmManager from gliner_manager import GlinerManager @@ -16,7 +17,7 @@ from camembert_ner_manager import CamembertNerManager SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") OUTDIR = SRC / "anonymise_audit_30" -CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") +CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH PDFS = [ SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf", diff --git a/run_batch_59ogc.py b/run_batch_59ogc.py index 56e520a..3d27aea 100644 --- 
a/run_batch_59ogc.py +++ b/run_batch_59ogc.py @@ -9,11 +9,12 @@ from collections import Counter sys.path.insert(0, str(Path(__file__).parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from eds_pseudo_manager import EdsPseudoManager SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") OUTDIR = SRC / "anonymise" -CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") +CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH def main(): # Charger EDS-Pseudo diff --git a/run_batch_silver_export.py b/run_batch_silver_export.py index 55b5ba7..f48a88c 100644 --- a/run_batch_silver_export.py +++ b/run_batch_silver_export.py @@ -19,9 +19,11 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH + SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") OUTDIR = SRC / "anonymise_silver_extra" -CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") +CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH # PDFs déjà traités dans l'audit 30 (à exclure) ALREADY_DONE_AUDIT30 = { diff --git a/scripts/merge_params.py b/scripts/merge_params.py index be5f450..f421b63 100644 --- a/scripts/merge_params.py +++ b/scripts/merge_params.py @@ -13,13 +13,18 @@ import json import sys from pathlib import Path +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + try: import yaml except ImportError: print("ERREUR : pyyaml requis (pip install pyyaml)") sys.exit(1) -CONFIG = Path(__file__).parent.parent / "config" / "dictionnaires.yml" +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH + + +CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False): diff --git a/server.py b/server.py index 38af3ab..0c12516 100644 --- a/server.py +++ b/server.py @@ -29,6 +29,8 @@ from typing import Optional from 
fastapi import FastAPI, File, Form, UploadFile from fastapi.responses import JSONResponse +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH + logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", @@ -86,7 +88,7 @@ def _load_models(): """Charge tous les modèles NER une seule fois au démarrage.""" global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg - _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml") + _cfg = load_dictionaries(RUNTIME_DICTIONARIES_CONFIG_PATH) # EDS-Pseudo (F1=0.97) if EdsPseudoManager is not None: @@ -288,7 +290,7 @@ async def anonymize_pdf( out_dir=out_dir, make_vector_redaction=vector_redaction, also_make_raster_burn=raster_redaction, - config_path=Path(__file__).parent / "config" / "dictionnaires.yml", + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=use_ner and ner_mgr is not None, ner_manager=ner_mgr, gliner_manager=_gliner_manager if use_ner else None, diff --git a/test_gui_error.py b/test_gui_error.py index 2fc937e..b632dea 100644 --- a/test_gui_error.py +++ b/test_gui_error.py @@ -3,6 +3,7 @@ from pathlib import Path import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH # Tester avec un seul PDF test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf") @@ -16,7 +17,7 @@ if test_pdf: Path("/tmp/test_gui"), make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ) print(f"✅ Succès: {result}") diff --git a/test_gui_fixed.py b/test_gui_fixed.py index 8034048..3dd0212 100644 --- a/test_gui_fixed.py +++ b/test_gui_fixed.py @@ -6,6 +6,7 @@ from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH # Test avec un PDF simple test_pdf = 
Path("/tmp/test_gui_pdfs") @@ -31,7 +32,7 @@ try: out_dir=out_dir, make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ner_manager=None, ner_thresholds=None, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4764d24 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +""" +Configuration pytest partagée pour les imports du dépôt. +""" +import sys +from pathlib import Path + + +ROOT_DIR = Path(__file__).resolve().parent.parent + +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) diff --git a/tests/synthetic_regression/README.md b/tests/synthetic_regression/README.md new file mode 100644 index 0000000..6c18c22 --- /dev/null +++ b/tests/synthetic_regression/README.md @@ -0,0 +1,26 @@ +# Tests synthétiques de non-régression + +Cette suite fournit 10 cas synthétiques courts, relisibles et diffables, pensés +comme première barrière de sécurité avant la revue humaine. + +Principe : +- `test.txt` contient le document synthétique d'entrée à relire ou diff-er. +- `expected.txt` contient la sortie anonymisée attendue, normalisée. +- `expected.audit.json` contient un résumé stable de l'audit attendu. +- `config_overlay.yml` est optionnel et permet de tester une surcharge locale. + +Objectif : +- bloquer les régressions évidentes sur les règles critiques ; +- rendre les écarts lisibles dans un diff Git ou dans la sortie de `pytest` ; +- compléter, et non remplacer, la validation humaine sur corpus réel. + +Portée de cette première version : +- texte uniquement ; +- pas encore de PDF/OCR/layout ; +- pas encore de cas `xfail` pour les bugs connus. 
+ +Exécution : + +```bash +pytest -q tests/unit/test_synthetic_regression.py +``` diff --git a/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json new file mode 100644 index 0000000..94735b2 --- /dev/null +++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json @@ -0,0 +1,22 @@ +[ + { + "kind": "DATE_NAISSANCE", + "original": "Né le 12/03/1980", + "replacement": "[DATE_NAISSANCE]" + }, + { + "kind": "NOM_GLOBAL", + "original": "ETCHEVERRY", + "replacement": "[NOM]" + }, + { + "kind": "NOM_GLOBAL", + "original": "CLAUDE", + "replacement": "[NOM]" + }, + { + "kind": "NOM_GLOBAL", + "original": "JEAN", + "replacement": "[NOM]" + } +] diff --git a/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt new file mode 100644 index 0000000..999c1b0 --- /dev/null +++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt @@ -0,0 +1,3 @@ +[NOM] [NOM] [NOM] +[DATE_NAISSANCE] +Consultation du 14/03/2024 diff --git a/tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt b/tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt new file mode 100644 index 0000000..2be982c --- /dev/null +++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt @@ -0,0 +1,3 @@ +ETCHEVERRY JEAN CLAUDE +Né le 12/03/1980 +Consultation du 14/03/2024 diff --git a/tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt b/tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt new file mode 100644 index 0000000..2be982c --- /dev/null +++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt @@ -0,0 +1,3 @@ +ETCHEVERRY JEAN CLAUDE +Né le 12/03/1980 +Consultation du 14/03/2024 diff --git 
a/tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json b/tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json new file mode 100644 index 0000000..e890ca1 --- /dev/null +++ b/tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json @@ -0,0 +1,12 @@ +[ + { + "kind": "EMAIL", + "original": "jean.dupont@example.com", + "replacement": "[EMAIL]" + }, + { + "kind": "TEL", + "original": "01 23 45 67 89", + "replacement": "[TEL]" + } +] diff --git a/tests/synthetic_regression/cases/002_contact_bundle/expected.txt b/tests/synthetic_regression/cases/002_contact_bundle/expected.txt new file mode 100644 index 0000000..0b62ff3 --- /dev/null +++ b/tests/synthetic_regression/cases/002_contact_bundle/expected.txt @@ -0,0 +1 @@ +Contact : [EMAIL] ou [TEL] diff --git a/tests/synthetic_regression/cases/002_contact_bundle/input.txt b/tests/synthetic_regression/cases/002_contact_bundle/input.txt new file mode 100644 index 0000000..557042d --- /dev/null +++ b/tests/synthetic_regression/cases/002_contact_bundle/input.txt @@ -0,0 +1 @@ +Contact: jean.dupont@example.com ou 01 23 45 67 89 diff --git a/tests/synthetic_regression/cases/002_contact_bundle/test.txt b/tests/synthetic_regression/cases/002_contact_bundle/test.txt new file mode 100644 index 0000000..557042d --- /dev/null +++ b/tests/synthetic_regression/cases/002_contact_bundle/test.txt @@ -0,0 +1 @@ +Contact: jean.dupont@example.com ou 01 23 45 67 89 diff --git a/tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json new file mode 100644 index 0000000..562a997 --- /dev/null +++ b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json @@ -0,0 +1,7 @@ +[ + { + "kind": "NDA", + "original": "1234567", + "replacement": "[NDA]" + } +] diff --git a/tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt 
b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt new file mode 100644 index 0000000..367b882 --- /dev/null +++ b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt @@ -0,0 +1,3 @@ +N° venue : +[NDA] +Date de séjour : 14/03/2024 diff --git a/tests/synthetic_regression/cases/003_multiline_venue_number/input.txt b/tests/synthetic_regression/cases/003_multiline_venue_number/input.txt new file mode 100644 index 0000000..68b3941 --- /dev/null +++ b/tests/synthetic_regression/cases/003_multiline_venue_number/input.txt @@ -0,0 +1,3 @@ +N° venue : +1234567 +Date de séjour : 14/03/2024 diff --git a/tests/synthetic_regression/cases/003_multiline_venue_number/test.txt b/tests/synthetic_regression/cases/003_multiline_venue_number/test.txt new file mode 100644 index 0000000..68b3941 --- /dev/null +++ b/tests/synthetic_regression/cases/003_multiline_venue_number/test.txt @@ -0,0 +1,3 @@ +N° venue : +1234567 +Date de séjour : 14/03/2024 diff --git a/tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json b/tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json new file mode 100644 index 0000000..8e3cc75 --- /dev/null +++ b/tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json @@ -0,0 +1,27 @@ +[ + { + "kind": "RPPS", + "original": "12345678901", + "replacement": "[RPPS]" + }, + { + "kind": "FINESS", + "original": "123456789", + "replacement": "[FINESS]" + }, + { + "kind": "IPP", + "original": "ABC12345", + "replacement": "[IPP]" + }, + { + "kind": "OGC", + "original": "12", + "replacement": "[OGC]" + }, + { + "kind": "IBAN", + "original": "FR76 3000 6000 0112 3456 7890 189", + "replacement": "[IBAN]" + } +] diff --git a/tests/synthetic_regression/cases/004_identifier_bundle/expected.txt b/tests/synthetic_regression/cases/004_identifier_bundle/expected.txt new file mode 100644 index 0000000..037c1ed --- /dev/null +++ 
b/tests/synthetic_regression/cases/004_identifier_bundle/expected.txt @@ -0,0 +1,5 @@ +RPPS : [RPPS] +FINESS : [FINESS] +IPP : [IPP] +N° OGC : [OGC] +IBAN : [IBAN] diff --git a/tests/synthetic_regression/cases/004_identifier_bundle/input.txt b/tests/synthetic_regression/cases/004_identifier_bundle/input.txt new file mode 100644 index 0000000..6eaea26 --- /dev/null +++ b/tests/synthetic_regression/cases/004_identifier_bundle/input.txt @@ -0,0 +1,5 @@ +RPPS : 12345678901 +FINESS : 123456789 +IPP : ABC12345 +N° OGC : 12 +IBAN : FR76 3000 6000 0112 3456 7890 189 diff --git a/tests/synthetic_regression/cases/004_identifier_bundle/test.txt b/tests/synthetic_regression/cases/004_identifier_bundle/test.txt new file mode 100644 index 0000000..6eaea26 --- /dev/null +++ b/tests/synthetic_regression/cases/004_identifier_bundle/test.txt @@ -0,0 +1,5 @@ +RPPS : 12345678901 +FINESS : 123456789 +IPP : ABC12345 +N° OGC : 12 +IBAN : FR76 3000 6000 0112 3456 7890 189 diff --git a/tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json new file mode 100644 index 0000000..34816c8 --- /dev/null +++ b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json @@ -0,0 +1,7 @@ +[ + { + "kind": "force_term", + "original": "CHCB", + "replacement": "[MASK]" + } +] diff --git a/tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt new file mode 100644 index 0000000..16fd55b --- /dev/null +++ b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt @@ -0,0 +1 @@ +Patient adressé au [MASK] pour avis. Retour au [MASK] demain. 
diff --git a/tests/synthetic_regression/cases/005_force_mask_default_term/input.txt b/tests/synthetic_regression/cases/005_force_mask_default_term/input.txt new file mode 100644 index 0000000..cb962de --- /dev/null +++ b/tests/synthetic_regression/cases/005_force_mask_default_term/input.txt @@ -0,0 +1 @@ +Patient adressé au CHCB pour avis. Retour au CHCB demain. diff --git a/tests/synthetic_regression/cases/005_force_mask_default_term/test.txt b/tests/synthetic_regression/cases/005_force_mask_default_term/test.txt new file mode 100644 index 0000000..cb962de --- /dev/null +++ b/tests/synthetic_regression/cases/005_force_mask_default_term/test.txt @@ -0,0 +1 @@ +Patient adressé au CHCB pour avis. Retour au CHCB demain. diff --git a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json @@ -0,0 +1 @@ +[] diff --git a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt new file mode 100644 index 0000000..21cfcc9 --- /dev/null +++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt @@ -0,0 +1 @@ +La classification internationale reste visible. La prise en charge est correcte. diff --git a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt new file mode 100644 index 0000000..21cfcc9 --- /dev/null +++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt @@ -0,0 +1 @@ +La classification internationale reste visible. La prise en charge est correcte. 
diff --git a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt new file mode 100644 index 0000000..21cfcc9 --- /dev/null +++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt @@ -0,0 +1 @@ +La classification internationale reste visible. La prise en charge est correcte. diff --git a/tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml b/tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml new file mode 100644 index 0000000..ac6fb23 --- /dev/null +++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml @@ -0,0 +1,3 @@ +blacklist: + force_mask_terms: + - LOCAL_SIGLE diff --git a/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json new file mode 100644 index 0000000..bd28d84 --- /dev/null +++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json @@ -0,0 +1,7 @@ +[ + { + "kind": "force_term", + "original": "LOCAL_SIGLE", + "replacement": "[MASK]" + } +] diff --git a/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt new file mode 100644 index 0000000..fe1e98c --- /dev/null +++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt @@ -0,0 +1 @@ +Réorientation vers [MASK] en urgence. diff --git a/tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt b/tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt new file mode 100644 index 0000000..68d896f --- /dev/null +++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt @@ -0,0 +1 @@ +Réorientation vers LOCAL_SIGLE en urgence. 
diff --git a/tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt b/tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt new file mode 100644 index 0000000..68d896f --- /dev/null +++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt @@ -0,0 +1 @@ +Réorientation vers LOCAL_SIGLE en urgence. diff --git a/tests/synthetic_regression/cases/008_ville_header/expected.audit.json b/tests/synthetic_regression/cases/008_ville_header/expected.audit.json new file mode 100644 index 0000000..30e58a8 --- /dev/null +++ b/tests/synthetic_regression/cases/008_ville_header/expected.audit.json @@ -0,0 +1,7 @@ +[ + { + "kind": "VILLE", + "original": "Bayonne", + "replacement": "[VILLE]" + } +] diff --git a/tests/synthetic_regression/cases/008_ville_header/expected.txt b/tests/synthetic_regression/cases/008_ville_header/expected.txt new file mode 100644 index 0000000..72a708f --- /dev/null +++ b/tests/synthetic_regression/cases/008_ville_header/expected.txt @@ -0,0 +1,2 @@ +[VILLE], le 12/03/2024 +Compte rendu adressé au patient. diff --git a/tests/synthetic_regression/cases/008_ville_header/input.txt b/tests/synthetic_regression/cases/008_ville_header/input.txt new file mode 100644 index 0000000..69886f1 --- /dev/null +++ b/tests/synthetic_regression/cases/008_ville_header/input.txt @@ -0,0 +1,2 @@ +Bayonne, le 12/03/2024 +Compte rendu adressé au patient. diff --git a/tests/synthetic_regression/cases/008_ville_header/test.txt b/tests/synthetic_regression/cases/008_ville_header/test.txt new file mode 100644 index 0000000..69886f1 --- /dev/null +++ b/tests/synthetic_regression/cases/008_ville_header/test.txt @@ -0,0 +1,2 @@ +Bayonne, le 12/03/2024 +Compte rendu adressé au patient. 
diff --git a/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json new file mode 100644 index 0000000..959b7fc --- /dev/null +++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json @@ -0,0 +1,17 @@ +[ + { + "kind": "NOM_GLOBAL", + "original": "ETCHEVERRY", + "replacement": "[NOM]" + }, + { + "kind": "NOM_GLOBAL", + "original": "CLAUDE", + "replacement": "[NOM]" + }, + { + "kind": "NOM_GLOBAL", + "original": "JEAN", + "replacement": "[NOM]" + } +] diff --git a/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt new file mode 100644 index 0000000..f330551 --- /dev/null +++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt @@ -0,0 +1,2 @@ +[NOM] [NOM] [NOM] +Le patient [NOM] revient ce jour. diff --git a/tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt b/tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt new file mode 100644 index 0000000..29cba36 --- /dev/null +++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt @@ -0,0 +1,2 @@ +ETCHEVERRY JEAN CLAUDE +Le patient ETCHEVERRY revient ce jour. diff --git a/tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt b/tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt new file mode 100644 index 0000000..29cba36 --- /dev/null +++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt @@ -0,0 +1,2 @@ +ETCHEVERRY JEAN CLAUDE +Le patient ETCHEVERRY revient ce jour. 
diff --git a/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json new file mode 100644 index 0000000..85efd48 --- /dev/null +++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json @@ -0,0 +1,7 @@ +[ + { + "kind": "ETAB_SPACED", + "original": "C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E", + "replacement": "[ETABLISSEMENT]" + } +] diff --git a/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt new file mode 100644 index 0000000..f9850fd --- /dev/null +++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt @@ -0,0 +1,2 @@ +[ETABLISSEMENT] +Service de cardiologie diff --git a/tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt b/tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt new file mode 100644 index 0000000..87c396a --- /dev/null +++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt @@ -0,0 +1,2 @@ +C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E +Service de cardiologie diff --git a/tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt b/tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt new file mode 100644 index 0000000..87c396a --- /dev/null +++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt @@ -0,0 +1,2 @@ +C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E +Service de cardiologie diff --git a/tests/synthetic_regression/manifest.json b/tests/synthetic_regression/manifest.json new file mode 100644 index 0000000..d42fea1 --- /dev/null +++ b/tests/synthetic_regression/manifest.json @@ -0,0 +1,110 @@ +{ + "001_patient_header_and_birth": { + "description": "En-tête patient en majuscules avec 
date de naissance masquée et date de soin conservée.", + "must_contain": [ + "[DATE_NAISSANCE]", + "Consultation du 14/03/2024" + ], + "must_not_contain": [ + "ETCHEVERRY", + "JEAN", + "CLAUDE", + "12/03/1980" + ] + }, + "002_contact_bundle": { + "description": "Email et téléphone dans une même ligne de contact.", + "must_contain": [ + "[EMAIL]", + "[TEL]" + ], + "must_not_contain": [ + "jean.dupont@example.com", + "01 23 45 67 89" + ] + }, + "003_multiline_venue_number": { + "description": "Numéro de venue éclaté sur deux lignes.", + "must_contain": [ + "N° venue :", + "[NDA]", + "Date de séjour : 14/03/2024" + ], + "must_not_contain": [ + "1234567" + ] + }, + "004_identifier_bundle": { + "description": "Bloc d'identifiants structurés variés.", + "must_contain": [ + "[RPPS]", + "[FINESS]", + "[IPP]", + "[OGC]", + "[IBAN]" + ], + "must_not_contain": [ + "12345678901", + "123456789", + "ABC12345", + "FR76 3000 6000 0112 3456 7890 189" + ] + }, + "005_force_mask_default_term": { + "description": "Terme forcé par la configuration par défaut.", + "must_contain": [ + "[MASK]" + ], + "must_not_contain": [ + "CHCB" + ] + }, + "006_whitelist_phrases_preserved": { + "description": "Expressions métier explicitement préservées.", + "must_contain": [ + "classification internationale", + "prise en charge" + ], + "must_not_contain": [] + }, + "007_overlay_force_mask_local": { + "description": "Terme local masqué via surcharge runtime.", + "must_contain": [ + "[MASK]" + ], + "must_not_contain": [ + "LOCAL_SIGLE" + ] + }, + "008_ville_header": { + "description": "Ville en en-tête de courrier, date conservée.", + "must_contain": [ + "[VILLE], le 12/03/2024" + ], + "must_not_contain": [ + "Bayonne" + ] + }, + "009_header_and_repeated_name": { + "description": "Propagation globale d'un nom vu dans l'en-tête.", + "must_contain": [ + "Le patient [NOM] revient ce jour." 
+ ], + "must_not_contain": [ + "ETCHEVERRY", + "JEAN", + "CLAUDE" + ] + }, + "010_spaced_establishment_header": { + "description": "En-tête d'établissement avec lettres espacées.", + "must_contain": [ + "[ETABLISSEMENT]", + "Service de cardiologie" + ], + "must_not_contain": [ + "C E N T R E", + "H O S P I T A L I E R" + ] + } +} diff --git a/tests/synthetic_regression/tests.md b/tests/synthetic_regression/tests.md new file mode 100644 index 0000000..a268db0 --- /dev/null +++ b/tests/synthetic_regression/tests.md @@ -0,0 +1,25 @@ +# Jeux de tests synthétiques + +Ces fichiers sont les cas de test relisibles à la main. Chaque dossier contient : +- `test.txt` : document synthétique d'entrée +- `expected.txt` : sortie anonymisée attendue +- `expected.audit.json` : résumé d'audit attendu + +Cas disponibles : +- `001_patient_header_and_birth` +- `002_contact_bundle` +- `003_multiline_venue_number` +- `004_identifier_bundle` +- `005_force_mask_default_term` +- `006_whitelist_phrases_preserved` +- `007_overlay_force_mask_local` +- `008_ville_header` +- `009_header_and_repeated_name` +- `010_spaced_establishment_header` + +Exemples de fichiers à ouvrir : +- [001 test](cases/001_patient_header_and_birth/test.txt) +- [001 attendu](cases/001_patient_header_and_birth/expected.txt) +- [004 test](cases/004_identifier_bundle/test.txt) +- [004 attendu](cases/004_identifier_bundle/expected.txt) +- [007 surcharge locale](cases/007_overlay_force_mask_local/config_overlay.yml) diff --git a/tests/synthetic_review/README.md b/tests/synthetic_review/README.md new file mode 100644 index 0000000..b6e69d1 --- /dev/null +++ b/tests/synthetic_review/README.md @@ -0,0 +1,26 @@ +# Corpus synthétique de revue humaine + +Ce corpus ne remplace pas les tests unitaires.
Il sert à valider des documents +complets, relus par un humain, avec un vrai diff entre : +- `test.txt` : document synthétique source +- `expected.txt` : anonymisation attendue selon la règle métier +- `actual/` : sortie réellement produite par le moteur + +Objectif : +- détecter les régressions de composition sur des documents réalistes ; +- rendre visibles les écarts de comportement du moteur ; +- préparer une validation humaine avant promotion éventuelle en suite bloquante. + +Commande : + +```bash +python3 tools/run_synthetic_review_corpus.py +``` + +Chaque exécution écrit : +- `actual.txt` +- `actual.audit.json` +- `actual.summary.json` +- `diff.txt` + +Sous [actual](actual/). diff --git a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json new file mode 100644 index 0000000..ee517c0 --- /dev/null +++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json @@ -0,0 +1,31 @@ +{ + "required_kinds": [ + "ADRESSE", + "CODE_POSTAL", + "DATE_NAISSANCE", + "EMAIL", + "ETAB", + "IPP", + "NDA", + "NOM_FORCE", + "TEL", + "VILLE", + "force_term" + ], + "must_contain": [ + "classification internationale", + "prise en charge", + "Service de cardiologie" + ], + "must_not_contain": [ + "ETCHEVERRY", + "JEAN", + "CLAUDE", + "12/03/1980", + "06 12 34 56 78", + "jean.claude.etcheverry@example.com", + "ABC12345", + "1234567", + "CHCB" + ] +} diff --git a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt new file mode 100644 index 0000000..9c6990a --- /dev/null +++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt @@ -0,0 +1,19 @@ +[ETABLISSEMENT] +[VILLE], le 14/03/2024 + +COMPTE RENDU D'HOSPITALISATION + +Patient : [NOM] [NOM] [NOM] +[DATE_NAISSANCE] +Adresse : [ADRESSE]
+Code postal : [CODE_POSTAL] +Ville de résidence : [VILLE] +Téléphone : [TEL] +Mail : [EMAIL] +IPP : [IPP] +N° venue : +[NDA] + +Le patient [NOM] [NOM] [NOM] est adressé au [MASK] pour bilan. +La classification internationale et la prise en charge sont discutées. +Service de cardiologie. diff --git a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md new file mode 100644 index 0000000..df80173 --- /dev/null +++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md @@ -0,0 +1,10 @@ +# Revue 001 + +Points critiques : +- le patient doit être masqué partout, y compris en reprise narrative ; +- la date de naissance doit être masquée, pas la date de soin ; +- l'adresse, le code postal, la ville, le téléphone, le mail, l'IPP et le numéro de venue doivent disparaître ; +- `classification internationale`, `prise en charge` et `Service de cardiologie` doivent rester lisibles. + +Écart attendu aujourd'hui : +- ce cas doit mettre en évidence si le moteur perd des labels structurés comme `Code postal :` ou `N° venue :`. diff --git a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt new file mode 100644 index 0000000..75aec28 --- /dev/null +++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt @@ -0,0 +1,19 @@ +CENTRE HOSPITALIER DE LA COTE BASQUE +Bayonne, le 14/03/2024 + +COMPTE RENDU D'HOSPITALISATION + +Patient : ETCHEVERRY JEAN CLAUDE +Né le 12/03/1980 +Adresse : 14 rue des Lilas +Code postal : 64100 +Ville de résidence : Bayonne +Téléphone : 06 12 34 56 78 +Mail : jean.claude.etcheverry@example.com +IPP : ABC12345 +N° venue : +1234567 + +Le patient ETCHEVERRY JEAN CLAUDE est adressé au CHCB pour bilan. +La classification internationale et la prise en charge sont discutées. +Service de cardiologie. 
diff --git a/tests/synthetic_review/cases/002_imagerie_complete/expectations.json b/tests/synthetic_review/cases/002_imagerie_complete/expectations.json new file mode 100644 index 0000000..4627b30 --- /dev/null +++ b/tests/synthetic_review/cases/002_imagerie_complete/expectations.json @@ -0,0 +1,26 @@ +{ + "required_kinds": [ + "DATE_NAISSANCE", + "DOSSIER", + "ETAB_SPACED", + "FINESS", + "IBAN", + "NOM_FORCE", + "OGC", + "RPPS" + ], + "must_contain": [ + "Service de radiologie", + "classification internationale" + ], + "must_not_contain": [ + "DUPONT", + "MARIE", + "PAULE", + "01/02/1975", + "23L35781", + "12345678901", + "123456789", + "FR76 3000 6000 0112 3456 7890 189" + ] +} diff --git a/tests/synthetic_review/cases/002_imagerie_complete/expected.txt b/tests/synthetic_review/cases/002_imagerie_complete/expected.txt new file mode 100644 index 0000000..24e6c7b --- /dev/null +++ b/tests/synthetic_review/cases/002_imagerie_complete/expected.txt @@ -0,0 +1,13 @@ +[ETABLISSEMENT] +Service de radiologie + +Compte rendu d'imagerie +Patient : [NOM] [NOM] [NOM] +[DATE_NAISSANCE] +N° examen : [DOSSIER] +RPPS : [RPPS] +FINESS : [FINESS] +N° OGC : [OGC] +IBAN : [IBAN] +Le dossier de [NOM] [NOM] [NOM] est revu ce jour. +La classification internationale est conservée. diff --git a/tests/synthetic_review/cases/002_imagerie_complete/review.md b/tests/synthetic_review/cases/002_imagerie_complete/review.md new file mode 100644 index 0000000..cac9b30 --- /dev/null +++ b/tests/synthetic_review/cases/002_imagerie_complete/review.md @@ -0,0 +1,7 @@ +# Revue 002 + +Points critiques : +- l'en-tête d'établissement espacé doit être réduit à un placeholder ; +- le numéro d'examen, le RPPS, le FINESS, l'OGC et l'IBAN doivent disparaître ; +- le nom du patient doit être masqué dans le champ structuré et dans la phrase narrative ; +- `Service de radiologie` et `classification internationale` doivent rester visibles. 
diff --git a/tests/synthetic_review/cases/002_imagerie_complete/test.txt b/tests/synthetic_review/cases/002_imagerie_complete/test.txt new file mode 100644 index 0000000..136f1ee --- /dev/null +++ b/tests/synthetic_review/cases/002_imagerie_complete/test.txt @@ -0,0 +1,13 @@ +C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E +Service de radiologie + +Compte rendu d'imagerie +Patient : DUPONT MARIE PAULE +Née le 01/02/1975 +N° examen : 23L35781 +RPPS : 12345678901 +FINESS : 123456789 +N° OGC : 12 +IBAN : FR76 3000 6000 0112 3456 7890 189 +Le dossier de DUPONT MARIE PAULE est revu ce jour. +La classification internationale est conservée. diff --git a/tests/synthetic_review/cases/003_consultation_complete/expectations.json b/tests/synthetic_review/cases/003_consultation_complete/expectations.json new file mode 100644 index 0000000..2c19a42 --- /dev/null +++ b/tests/synthetic_review/cases/003_consultation_complete/expectations.json @@ -0,0 +1,29 @@ +{ + "required_kinds": [ + "DATE_NAISSANCE", + "EMAIL", + "ETAB", + "IPP", + "NOM_FORCE", + "RPPS", + "TEL", + "VILLE", + "force_term" + ], + "must_contain": [ + "prise en charge en hôpital de jour" + ], + "must_not_contain": [ + "LAFITTE", + "ANNE", + "MARIE", + "18/07/1968", + "Bordeaux", + "Anglet", + "anne.lafitte@example.com", + "01 23 45 67 89", + "10987654321", + "ZXC98765", + "CHCB" + ] +} diff --git a/tests/synthetic_review/cases/003_consultation_complete/expected.txt b/tests/synthetic_review/cases/003_consultation_complete/expected.txt new file mode 100644 index 0000000..d9a83d7 --- /dev/null +++ b/tests/synthetic_review/cases/003_consultation_complete/expected.txt @@ -0,0 +1,14 @@ +[ETABLISSEMENT] +[VILLE], le 22/05/2024 + +CONSULTATION DE SUIVI + +Patient : [NOM] [NOM] [NOM] +[DATE_NAISSANCE] +Lieu de naissance : [VILLE] +Ville de résidence : [VILLE] +Contact : [EMAIL] ou [TEL] +RPPS : [RPPS] +IPP : [IPP] +Le patient [NOM] [NOM] [NOM] est adressé au [MASK]. 
+La prise en charge en hôpital de jour est maintenue. diff --git a/tests/synthetic_review/cases/003_consultation_complete/review.md b/tests/synthetic_review/cases/003_consultation_complete/review.md new file mode 100644 index 0000000..4311afa --- /dev/null +++ b/tests/synthetic_review/cases/003_consultation_complete/review.md @@ -0,0 +1,7 @@ +# Revue 003 + +Points critiques : +- la ville d'en-tête, le lieu de naissance et la ville de résidence doivent être masqués ; +- le contact mail/téléphone, le RPPS et l'IPP doivent être masqués ; +- la reprise narrative du nom du patient doit être masquée ; +- `prise en charge en hôpital de jour` doit rester visible. diff --git a/tests/synthetic_review/cases/003_consultation_complete/test.txt b/tests/synthetic_review/cases/003_consultation_complete/test.txt new file mode 100644 index 0000000..f912df2 --- /dev/null +++ b/tests/synthetic_review/cases/003_consultation_complete/test.txt @@ -0,0 +1,14 @@ +CLINIQUE ATLANTIQUE +Biarritz, le 22/05/2024 + +CONSULTATION DE SUIVI + +Patient : LAFITTE ANNE MARIE +Née le 18/07/1968 +Lieu de naissance : Bordeaux +Ville de résidence : Anglet +Contact : anne.lafitte@example.com ou 01 23 45 67 89 +RPPS : 10987654321 +IPP : ZXC98765 +Le patient LAFITTE ANNE MARIE est adressé au CHCB. +La prise en charge en hôpital de jour est maintenue. 
diff --git a/tests/synthetic_review/cases/004_structured_admin_complete/expectations.json b/tests/synthetic_review/cases/004_structured_admin_complete/expectations.json new file mode 100644 index 0000000..f324ebd --- /dev/null +++ b/tests/synthetic_review/cases/004_structured_admin_complete/expectations.json @@ -0,0 +1,27 @@ +{ + "required_kinds": [ + "EMAIL", + "FINESS", + "IPP", + "NOM_GLOBAL", + "OGC", + "RPPS", + "TEL", + "VILLE", + "force_term" + ], + "must_not_contain": [ + "ETCHEVERRY", + "JEAN", + "CLAUDE", + "ABC12345", + "123456789", + "12345678901", + "Bayonne", + "Bordeaux", + "Anglet", + "06 11 22 33 44", + "jean.dupont@example.com", + "CHCB" + ] +} diff --git a/tests/synthetic_review/cases/004_structured_admin_complete/expected.txt b/tests/synthetic_review/cases/004_structured_admin_complete/expected.txt new file mode 100644 index 0000000..ed0052d --- /dev/null +++ b/tests/synthetic_review/cases/004_structured_admin_complete/expected.txt @@ -0,0 +1,11 @@ +[NOM] [NOM] [NOM] +IPP : [IPP] +FINESS : [FINESS] +RPPS : [RPPS] +[VILLE], le 12/03/2024 +Lieu de naissance : [VILLE] +Ville de résidence : [VILLE] +Téléphone : [TEL] +Mail : [EMAIL] +N° OGC : [OGC] +Patient adressé au [MASK] pour avis. Retour au [MASK] demain. diff --git a/tests/synthetic_review/cases/004_structured_admin_complete/review.md b/tests/synthetic_review/cases/004_structured_admin_complete/review.md new file mode 100644 index 0000000..edcb727 --- /dev/null +++ b/tests/synthetic_review/cases/004_structured_admin_complete/review.md @@ -0,0 +1,7 @@ +# Revue 004 + +Points critiques : +- les identifiants structurés doivent être masqués même quand le label et la valeur sont séparés ; +- la ville d'en-tête et les villes structurées doivent disparaître ; +- le nom de patient en en-tête doit être propagé ; +- les deux occurrences de `CHCB` doivent être masquées. 
diff --git a/tests/synthetic_review/cases/004_structured_admin_complete/test.txt b/tests/synthetic_review/cases/004_structured_admin_complete/test.txt new file mode 100644 index 0000000..d997d16 --- /dev/null +++ b/tests/synthetic_review/cases/004_structured_admin_complete/test.txt @@ -0,0 +1,12 @@ +ETCHEVERRY JEAN CLAUDE +IPP +ABC12345 +FINESS : 123456789 +RPPS : 12345678901 +Bayonne, le 12/03/2024 +Lieu de naissance : Bordeaux +Ville de résidence : Anglet +Téléphone : 06 11 22 33 44 +Mail : jean.dupont@example.com +N° OGC : 12 +Patient adressé au CHCB pour avis. Retour au CHCB demain. diff --git a/tests/synthetic_review/tests.md b/tests/synthetic_review/tests.md new file mode 100644 index 0000000..b1a94bb --- /dev/null +++ b/tests/synthetic_review/tests.md @@ -0,0 +1,15 @@ +# Index du corpus de revue + +Cas complets disponibles : +- [001 source](cases/001_crh_hospitalisation_complete/test.txt) +- [001 attendu](cases/001_crh_hospitalisation_complete/expected.txt) +- [001 revue](cases/001_crh_hospitalisation_complete/review.md) +- [002 source](cases/002_imagerie_complete/test.txt) +- [002 attendu](cases/002_imagerie_complete/expected.txt) +- [002 revue](cases/002_imagerie_complete/review.md) +- [003 source](cases/003_consultation_complete/test.txt) +- [003 attendu](cases/003_consultation_complete/expected.txt) +- [003 revue](cases/003_consultation_complete/review.md) +- [004 source](cases/004_structured_admin_complete/test.txt) +- [004 attendu](cases/004_structured_admin_complete/expected.txt) +- [004 revue](cases/004_structured_admin_complete/review.md) diff --git a/tests/unit/test_config_externalization.py b/tests/unit/test_config_externalization.py new file mode 100644 index 0000000..ed9045e --- /dev/null +++ b/tests/unit/test_config_externalization.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +""" +Tests de non-régression pour la config externalisée.
+""" +from pathlib import Path + +import anonymizer_core_refactored_onnx as core +from config_defaults import ( + deep_merge_dict, + ensure_runtime_dictionaries_config, + load_effective_dictionaries_dict, + read_default_dictionaries_text, + read_runtime_dictionaries_overlay_text, +) + + +def test_default_config_template_is_externalized(): + text = read_default_dictionaries_text() + + assert "blacklist:" in text + assert "whitelist_phrases:" in text + + cfg = core.load_dictionaries(None) + assert "CHCB" in cfg["blacklist"]["force_mask_terms"] + + +def test_runtime_overlay_template_is_minimal(): + text = read_runtime_dictionaries_overlay_text() + + assert "dictionnaires.default.yml" in text + assert "{}" in text + + +def test_deep_merge_dict_preserves_nested_defaults(): + base = { + "whitelist": { + "sections_titres": ["DIM"], + "org_gpe_keep": False, + }, + "flags": { + "case_insensitive": True, + "regex_engine": "python", + }, + } + override = { + "whitelist": { + "sections_titres": ["GHM"], + "org_gpe_keep": True, + }, + "flags": { + "regex_engine": "re2", + }, + } + + merged = deep_merge_dict(base, override) + + assert merged["whitelist"]["sections_titres"] == ["DIM", "GHM"] + assert merged["whitelist"]["org_gpe_keep"] is True + assert merged["flags"]["case_insensitive"] is True + assert merged["flags"]["regex_engine"] == "re2" + + +def test_additional_stopwords_refresh_and_reset(tmp_path: Path): + cfg_path = tmp_path / "cfg.yml" + cfg_path.write_text("additional_stopwords:\n - xyzzymed\n", encoding="utf-8") + + core.load_dictionaries(cfg_path) + assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET + assert "xyzzymed" in core._MEDICAL_STOP_WORDS + + core.load_dictionaries(None) + assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET + assert "xyzzymed" not in core._MEDICAL_STOP_WORDS + + +def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path): + cfg_path = tmp_path / "dictionnaires.yml" + + created = 
ensure_runtime_dictionaries_config(cfg_path) + assert created == cfg_path + assert cfg_path.exists() + + effective = load_effective_dictionaries_dict(cfg_path) + assert "CHCB" in effective["blacklist"]["force_mask_terms"] + + cfg_path.write_text( + "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n", + encoding="utf-8", + ) + effective = load_effective_dictionaries_dict(cfg_path) + assert "CHCB" in effective["blacklist"]["force_mask_terms"] + assert "LOCAL_SIGLE" in effective["blacklist"]["force_mask_terms"] diff --git a/tests/unit/test_header_pii_detection.py b/tests/unit/test_header_pii_detection.py new file mode 100644 index 0000000..c06455e --- /dev/null +++ b/tests/unit/test_header_pii_detection.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +""" +Tests de non-régression pour les fuites en en-tête de document. +""" +from anonymizer_core_refactored_onnx import ( + RE_NUM_ACCESSION_HEADER, + RE_NUM_EXAMEN_PATIENT, + anonymise_document_regex, + load_dictionaries, + selective_rescan, +) + + +class TestHeaderPiiDetection: + """Cas réels vus en production: nom patient en capitales + numéro d'examen compact.""" + + def test_uppercase_patient_header_is_masked(self): + cfg = load_dictionaries(None) + anon = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], cfg) + + assert "ETCHEVERRY" not in anon.text_out + assert "JEAN" not in anon.text_out + assert "CLAUDE" not in anon.text_out + assert anon.text_out == "[NOM] [NOM] [NOM]" + + def test_compact_exam_number_matches_labeled_pattern(self): + match = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781") + + assert match is not None + assert match.group(1) == "23L35781" + + def test_bare_header_accession_number_is_added_to_audit(self): + cfg = load_dictionaries(None) + text = ( + "N° 23L35781\n" + "Prélevé le 26/07/2023\n" + "Enregistré le 27/07/2023\n" + ) + + match = RE_NUM_ACCESSION_HEADER.search(text) + assert match is not None + assert match.group(1) == "23L35781" + + anon = anonymise_document_regex([text], [[]], 
cfg) + assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit) + + def test_labeled_exam_number_is_masked_in_text_and_audit(self): + cfg = load_dictionaries(None) + + anon = anonymise_document_regex(["N° examen : 23L35781"], [[]], cfg) + text = selective_rescan(anon.text_out, cfg) + + assert text == "N° examen : [DOSSIER]" + assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit) + + def test_structured_code_postal_preserves_label_and_audit(self): + cfg = load_dictionaries(None) + + anon = anonymise_document_regex(["Code postal : 64100"], [[]], cfg) + text = selective_rescan(anon.text_out, cfg) + + assert text == "Code postal : [CODE_POSTAL]" + assert any(h.kind == "CODE_POSTAL" and h.original == "64100" for h in anon.audit) diff --git a/tests/unit/test_synthetic_regression.py b/tests/unit/test_synthetic_regression.py new file mode 100644 index 0000000..215b3af --- /dev/null +++ b/tests/unit/test_synthetic_regression.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +Tests synthétiques de non-régression pour l'anonymisation. 
+""" +import json +from pathlib import Path + +import pytest + +from anonymizer_core_refactored_onnx import ( + anonymise_document_regex, + load_dictionaries, + selective_rescan, +) +from evaluation.leak_scanner import LeakScanner + + +SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression" +CASES_DIR = SUITE_DIR / "cases" +MANIFEST_PATH = SUITE_DIR / "manifest.json" +LEAK_SCANNER = LeakScanner() + + +def _normalize_text(text: str) -> str: + text = text.replace("\r\n", "\n").replace("\r", "\n") + return "\n".join(line.rstrip() for line in text.strip().splitlines()) + + +def _load_manifest() -> dict: + return json.loads(MANIFEST_PATH.read_text(encoding="utf-8")) + + +def _case_dirs() -> list[Path]: + return sorted(path for path in CASES_DIR.iterdir() if path.is_dir()) + + +def _normalize_audit(audit: list) -> list[dict]: + return [ + { + "kind": hit.kind, + "original": hit.original, + "replacement": hit.placeholder, + } + for hit in audit + ] + + +def _load_case_cfg(case_dir: Path): + overlay_path = case_dir / "config_overlay.yml" + return load_dictionaries(overlay_path if overlay_path.exists() else None) + + +def _assertions_for(case_name: str) -> dict: + manifest = _load_manifest() + return manifest[case_name] + + +def test_synthetic_regression_inventory(): + assert MANIFEST_PATH.exists() + assert len(_case_dirs()) == 10 + assert len(_load_manifest()) == 10 + + +@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name) +def test_synthetic_regression_case(case_dir: Path): + cfg = _load_case_cfg(case_dir) + case_rules = _assertions_for(case_dir.name) + + input_path = case_dir / "test.txt" + if not input_path.exists(): + input_path = case_dir / "input.txt" + + input_text = input_path.read_text(encoding="utf-8") + expected_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8")) + expected_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8")) + + result = 
anonymise_document_regex([input_text], [[]], cfg) + actual_text = _normalize_text(selective_rescan(result.text_out, cfg)) + actual_audit = _normalize_audit(result.audit) + + assert actual_text == expected_text + assert actual_audit == expected_audit + + for required in case_rules.get("must_contain", []): + assert required in actual_text + + for forbidden in case_rules.get("must_not_contain", []): + assert forbidden not in actual_text + + leaks = LEAK_SCANNER.scan_text( + actual_text, + [ + { + "kind": item["kind"], + "original": item["original"], + } + for item in actual_audit + ], + ) + assert not leaks diff --git a/tools/debug_force_term.py b/tools/debug_force_term.py index 4d0c40d..48bd586 100644 --- a/tools/debug_force_term.py +++ b/tools/debug_force_term.py @@ -2,12 +2,12 @@ """Debug force_term mechanism.""" import re -import yaml -from pathlib import Path -# Load config -cfg_path = Path("config/dictionnaires.yml") -cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict + +# Load effective config +cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH +cfg = load_effective_dictionaries_dict(cfg_path) print("=" * 80) print("CONFIG LOADED") diff --git a/tools/quick_test_date_correction.py b/tools/quick_test_date_correction.py index 8fc55c4..391f80d 100644 --- a/tools/quick_test_date_correction.py +++ b/tools/quick_test_date_correction.py @@ -5,6 +5,7 @@ import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf # Test sur 3 documents du test dataset @@ -32,7 +33,7 @@ for doc in test_docs: out_dir=out_dir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ner_manager=None, vlm_manager=None, @@ -56,4 +57,3 @@ for 
doc in test_docs: print(f"❌ {pdf_path.name}: Erreur - {e}") print("\n✅ Test terminé") - diff --git a/tools/run_synthetic_review_corpus.py b/tools/run_synthetic_review_corpus.py new file mode 100644 index 0000000..a8921d6 --- /dev/null +++ b/tools/run_synthetic_review_corpus.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Exécute le corpus synthétique de revue humaine et produit les diffs. +""" +from __future__ import annotations + +import argparse +import difflib +import json +import shutil +import sys +from collections import Counter +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from anonymizer_core_refactored_onnx import ( # noqa: E402 + anonymise_document_regex, + load_dictionaries, + selective_rescan, +) +from evaluation.leak_scanner import LeakScanner # noqa: E402 + + +CORPUS_DIR = ROOT / "tests" / "synthetic_review" +CASES_DIR = CORPUS_DIR / "cases" +ACTUAL_DIR = CORPUS_DIR / "actual" +SCANNER = LeakScanner() + + +def normalize_text(text: str) -> str: + text = text.replace("\r\n", "\n").replace("\r", "\n") + return "\n".join(line.rstrip() for line in text.strip().splitlines()) + "\n" + + +def load_expectations(case_dir: Path) -> dict: + expectations_path = case_dir / "expectations.json" + if not expectations_path.exists(): + return {} + return json.loads(expectations_path.read_text(encoding="utf-8")) + + +def build_leak_scan_seed(audit: list[dict]) -> list[dict]: + """Évite les faux positifs sur les valeurs trop courtes ou ambiguës.""" + seed = [] + for item in audit: + original = str(item.get("original", "")).strip() + compact = original.replace(" ", "") + if len(compact) < 4: + continue + if compact.isdigit() and len(compact) < 6: + continue + seed.append( + { + "kind": item["kind"], + "original": original, + } + ) + return seed + + +def run_case(case_dir: Path) -> dict: + cfg_path = case_dir / "config_overlay.yml" + cfg = load_dictionaries(cfg_path if 
cfg_path.exists() else None) + + source_text = (case_dir / "test.txt").read_text(encoding="utf-8") + expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8")) + expectations = load_expectations(case_dir) + + anon = anonymise_document_regex([source_text], [[]], cfg) + actual_text = normalize_text(selective_rescan(anon.text_out, cfg)) + audit = [ + { + "kind": hit.kind, + "original": hit.original, + "replacement": hit.placeholder, + } + for hit in anon.audit + ] + summary = { + "kinds_present": sorted(set(item["kind"] for item in audit)), + "kind_counts": dict(sorted(Counter(item["kind"] for item in audit).items())), + "audit_len": len(audit), + "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)), + } + + case_actual_dir = ACTUAL_DIR / case_dir.name + if case_actual_dir.exists(): + shutil.rmtree(case_actual_dir) + case_actual_dir.mkdir(parents=True, exist_ok=True) + + (case_actual_dir / "actual.txt").write_text(actual_text, encoding="utf-8") + (case_actual_dir / "actual.audit.json").write_text( + json.dumps(audit, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + (case_actual_dir / "actual.summary.json").write_text( + json.dumps(summary, ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + + diff_lines = list( + difflib.unified_diff( + expected_text.splitlines(keepends=True), + actual_text.splitlines(keepends=True), + fromfile=f"{case_dir.name}/expected.txt", + tofile=f"{case_dir.name}/actual.txt", + ) + ) + (case_actual_dir / "diff.txt").write_text("".join(diff_lines), encoding="utf-8") + + failures = [] + if actual_text != expected_text: + failures.append("text_diff") + + if summary["leaks"]: + failures.append("leak_detected") + + required_kinds = expectations.get("required_kinds", []) + missing_kinds = sorted(kind for kind in required_kinds if kind not in summary["kinds_present"]) + if missing_kinds: + failures.append(f"missing_kinds:{','.join(missing_kinds)}") + + for required in 
expectations.get("must_contain", []): + if required not in actual_text: + failures.append(f"missing_text:{required}") + + for forbidden in expectations.get("must_not_contain", []): + if forbidden in actual_text: + failures.append(f"forbidden_text:{forbidden}") + + return { + "case": case_dir.name, + "failures": failures, + "output_dir": str(case_actual_dir), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine") + parser.add_argument( + "--strict", + action="store_true", + help="Retourne un code non nul si un cas diffère de l'attendu.", + ) + args = parser.parse_args() + + ACTUAL_DIR.mkdir(parents=True, exist_ok=True) + case_dirs = sorted(path for path in CASES_DIR.iterdir() if path.is_dir()) + results = [run_case(case_dir) for case_dir in case_dirs] + + has_failures = False + for result in results: + if result["failures"]: + has_failures = True + print(f"[FAIL] {result['case']}: {', '.join(result['failures'])}") + else: + print(f"[OK] {result['case']}") + print(f" -> {result['output_dir']}") + + if args.strict and has_failures: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/test_all_cro.py b/tools/test_all_cro.py index 610e201..4797002 100644 --- a/tools/test_all_cro.py +++ b/tools/test_all_cro.py @@ -8,6 +8,7 @@ sys.path.insert(0, '.') from pathlib import Path import re +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf import time @@ -47,7 +48,7 @@ def test_all_cro(): output_dir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml") + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH ) # Lire le texte anonymisé diff --git a/tools/test_chcb_leak.py b/tools/test_chcb_leak.py index d772da8..6e9f483 100644 --- a/tools/test_chcb_leak.py +++ b/tools/test_chcb_leak.py @@ -8,6 +8,7 @@ import sys sys.path.insert(0, 
str(Path(__file__).parent.parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH def test_chcb_detection(): """Test CHCB detection on the 2 documents with leaks.""" @@ -53,7 +54,7 @@ def test_chcb_detection(): out_dir=outdir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ) @@ -102,7 +103,7 @@ def test_chcb_detection(): out_dir=outdir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ) diff --git a/tools/test_date_propagation.py b/tools/test_date_propagation.py index bd3d643..fd46f0b 100644 --- a/tools/test_date_propagation.py +++ b/tools/test_date_propagation.py @@ -9,6 +9,7 @@ sys.path.insert(0, '.') from pathlib import Path import re +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf def test_date_propagation(): @@ -47,7 +48,7 @@ def test_date_propagation(): output_dir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml") + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH ) # Lire le texte anonymisé diff --git a/tools/test_gui_complete.py b/tools/test_gui_complete.py index ae3a54e..ca0c74c 100755 --- a/tools/test_gui_complete.py +++ b/tools/test_gui_complete.py @@ -9,6 +9,7 @@ import time sys.path.insert(0, str(Path(__file__).parent.parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH # Dossier de test test_dir = Path("/tmp/test_gui_pdfs") @@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1): out_dir=out_dir, make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, 
ner_manager=None, ner_thresholds=None, diff --git a/tools/test_gui_simulation.py b/tools/test_gui_simulation.py index 576aea0..8d05ae8 100755 --- a/tools/test_gui_simulation.py +++ b/tools/test_gui_simulation.py @@ -8,6 +8,7 @@ import sys sys.path.insert(0, str(Path(__file__).parent.parent)) import anonymizer_core_refactored_onnx as core +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH # Simuler exactement ce que fait le GUI test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf") @@ -27,7 +28,7 @@ try: out_dir=out_dir, make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ner_manager=None, ner_thresholds=None, diff --git a/tools/test_phase1_corrections.py b/tools/test_phase1_corrections.py index f0d4198..b35228d 100755 --- a/tools/test_phase1_corrections.py +++ b/tools/test_phase1_corrections.py @@ -16,6 +16,7 @@ import re # Ajouter le répertoire racine au path sys.path.insert(0, str(Path(__file__).parent.parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf def test_phase1_corrections(): @@ -52,7 +53,7 @@ def test_phase1_corrections(): # Anonymiser le document result = process_pdf( pdf_path=pdf_path, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, ner_manager=None, eds_pseudo_manager=None, vlm_manager=None, diff --git a/tools/validate_corpus_sample.py b/tools/validate_corpus_sample.py index 12239be..9715be7 100644 --- a/tools/validate_corpus_sample.py +++ b/tools/validate_corpus_sample.py @@ -16,6 +16,7 @@ import re sys.path.insert(0, str(Path(__file__).parent.parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf def validate_corpus_sample(): @@ -94,7 +95,7 @@ def validate_corpus_sample(): output_dir, 
make_vector_redaction=False, also_make_raster_burn=False, # Pas de PDF pour aller plus vite - config_path=Path("config/dictionnaires.yml") + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH ) doc_time = time.time() - doc_start diff --git a/tools/validate_full_corpus.py b/tools/validate_full_corpus.py index 2dc2d14..feea373 100644 --- a/tools/validate_full_corpus.py +++ b/tools/validate_full_corpus.py @@ -17,6 +17,7 @@ import re sys.path.insert(0, str(Path(__file__).parent.parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf def validate_full_corpus(): @@ -70,7 +71,7 @@ def validate_full_corpus(): output_dir, make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path("config/dictionnaires.yml") + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH ) doc_time = time.time() - doc_start diff --git a/tools/validate_phase1_on_production.py b/tools/validate_phase1_on_production.py index 473eaad..deed368 100644 --- a/tools/validate_phase1_on_production.py +++ b/tools/validate_phase1_on_production.py @@ -10,6 +10,7 @@ from pathlib import Path import json sys.path.insert(0, str(Path(__file__).parent.parent)) +from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH from anonymizer_core_refactored_onnx import process_pdf # 5 documents du corpus production (OGC 008) @@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]: out_dir=out_dir, make_vector_redaction=False, also_make_raster_burn=False, - config_path=Path("config/dictionnaires.yml"), + config_path=RUNTIME_DICTIONARIES_CONFIG_PATH, use_hf=False, ner_manager=None, vlm_manager=None,