Externalize dictionaries and add anonymization review corpus

2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions
--- a/FONCTIONNEMENT.md
+++ b/FONCTIONNEMENT.md
@@ -122,8 +122,9 @@ Fonction : `_mask_line_by_regex`
 | Dates         | `[DATE]`       | 12/03/2024           |
 | Adresses      | `[ADRESSE]`    | 12 rue de la Paix    |
-Configuration supplementaire via `config/dictionnaires.yml` :
+Configuration :
-listes blanches, force-mask et regex personnalisees.
+- `config/dictionnaires.default.yml` : template versionne, source de verite des valeurs par defaut
 - `config/dictionnaires.yml` : surcharge locale chargee par defaut, contenant uniquement les ecarts site/runtime
 ### 3. Reconnaissance d'entites nommees (NER)
@@ -180,6 +181,7 @@ un fallback OCR est utilise :
 | Element                       | Description                                    |
 |-------------------------------|------------------------------------------------|
-| `config/dictionnaires.yml`    | Listes blanches, force-mask, regex custom      |
+| `config/dictionnaires.default.yml` | Valeurs par defaut completes et versionnees |
 | `config/dictionnaires.yml`    | Surcharge locale optionnelle (ecarts uniquement) |
 | `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots)       |
 | Ligne de commande             | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` |
--- a/Pseudonymisation_Gui_Models_V4.py
+++ b/Pseudonymisation_Gui_Models_V4.py
@@ -48,33 +48,16 @@ try:
 except Exception:
    yaml = None
-APP_TITLE = "Pseudonymisation de PDF"
+from config_defaults import (
-DEFAULT_CFG = Path("config/dictionnaires.yml")
+    RUNTIME_DICTIONARIES_CONFIG_PATH,
    read_default_dictionaries_text,
    read_runtime_dictionaries_overlay_text,
 )
-DEFAULTS_CFG_TEXT = r"""
+APP_TITLE = "Pseudonymisation de PDF"
-# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
+DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
-version: 1
+DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
-encoding: "utf-8"
+RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
 normalization: "NFKC"
 whitelist:
  sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
  noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
  org_gpe_keep: true
 blacklist:
  force_mask_terms: []
  force_mask_regex: []
 kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
 regex_overrides:
  - name: OGC_court
    pattern: |-
      \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
    placeholder: '[OGC]'
    flags: [IGNORECASE]
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: "python"
 """
 class ToolTip:
@@ -208,7 +191,7 @@ class App:
    # YAML helpers
    def _ensure_cfg_exists(self):
        p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
-        if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
+        if not p.exists(): p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
    def _cfg_browse(self):
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
        if d: self.cfg_path.set(d)
@@ -225,14 +208,14 @@ class App:
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
        try:
-            Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8")
+            Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or {}, allow_unicode=True, sort_keys=False), encoding="utf-8")
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
    # Reload action: calls self._load_cfg() first, then reports "Règles rechargées." through self._log().
    # NOTE(review): presumably _load_cfg re-reads the YAML at self.cfg_path — confirm against its definition.
    def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
    def _restore_defaults(self):
        try:
-            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg()
+            Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8"); self._log("Surcharge locale réinitialisée."); self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
--- a/Pseudonymisation_Gui_V5.py
+++ b/Pseudonymisation_Gui_V5.py
@@ -20,7 +20,6 @@ import os
 import platform
 import queue
 import re
 import shutil
 import subprocess
 import sys
 import threading
@@ -75,6 +74,11 @@ try:
 except Exception:
    yaml = None
 from config_defaults import (
    read_default_dictionaries_text,
    read_runtime_dictionaries_overlay_text,
 )
 # ---------------------------------------------------------------------------
 # Thème optionnel
 # ---------------------------------------------------------------------------
@@ -142,47 +146,19 @@ def _resolve_config() -> Path:
    pour que l'utilisateur puisse la modifier sans recompiler.
    """
    exe_cfg = _exe_dir() / "config" / "dictionnaires.yml"
    app_cfg = _app_dir() / "config" / "dictionnaires.yml"
    if exe_cfg.exists():
        return exe_cfg
-    # Premier lancement : copier la config embarquée à côté de l'exe
+    exe_cfg.parent.mkdir(parents=True, exist_ok=True)
-    if app_cfg.exists():
+    exe_cfg.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
-        exe_cfg.parent.mkdir(parents=True, exist_ok=True)
+    return exe_cfg
        import shutil
        shutil.copy2(str(app_cfg), str(exe_cfg))
        return exe_cfg
    return app_cfg  # fallback
 DEFAULT_CFG = _resolve_config()
 MODELS_DIR = _app_dir() / "models"
-DEFAULTS_CFG_TEXT = r"""
+DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
-# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
+RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
 version: 1
 encoding: "utf-8"
 normalization: "NFKC"
 whitelist:
  sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
  noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
  org_gpe_keep: true
 blacklist:
  force_mask_terms: []
  force_mask_regex: []
 kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
 regex_overrides:
  - name: OGC_court
    pattern: |-
      \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
    placeholder: '[OGC]'
    flags: [IGNORECASE]
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: "python"
 """
 # Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir)
 # Magenta du logo : primaire (boutons, accents)
@@ -1593,7 +1569,7 @@ class App:
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
-            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
+            p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
    def _load_cfg(self):
        if yaml is None:
--- a/anonymizer_core_refactored.py
+++ b/anonymizer_core_refactored.py
@@ -24,36 +24,11 @@ try:
    import yaml  # PyYAML for dictionaries
 except Exception:
    yaml = None
-
+from config_defaults import (
-# ----------------- Defaults & Config -----------------
+    RUNTIME_DICTIONARIES_CONFIG_PATH,
-DEFAULTS_CFG = {
+    load_effective_dictionaries_dict,
-    "version": 1,
+    load_default_dictionaries_dict,
-    "encoding": "utf-8",
+)
    "normalization": "NFKC",
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": True,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
 }
 PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
@@ -103,16 +78,7 @@ class AnonResult:
 # ----------------- Config loader -----------------
 def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
-    cfg = DEFAULTS_CFG.copy()
+    return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
    if config_path and config_path.exists() and yaml is not None:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
            # shallow-merge for top-level keys
            for k, v in user.items():
                cfg[k] = v
        except Exception:
            pass
    return cfg
 # ----------------- Extraction -----------------
@@ -416,7 +382,7 @@ if __name__ == "__main__":
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
-    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
+    ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
    args = ap.parse_args()
    outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
    print(json.dumps(outs, indent=2, ensure_ascii=False))
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -44,6 +44,12 @@ try:
 except Exception:
    yaml = None
 from config_defaults import (
    RUNTIME_DICTIONARIES_CONFIG_PATH,
    load_effective_dictionaries_dict,
    load_default_dictionaries_dict,
 )
 try:
    from doctr.models import ocr_predictor as _doctr_ocr_predictor
    _DOCTR_AVAILABLE = True
@@ -115,6 +121,29 @@ def _load_bdpm_medication_names() -> set:
        return set()
 def _load_wordlist_file(
    path: Path,
    *,
    transform=lambda s: s,
    label: str,
    min_len: int = 1,
 ) -> set:
    """Load a plain-text word list, one word per line.

    The file is read as UTF-8. Each line is stripped of surrounding
    whitespace; empty lines, lines starting with ``#`` and lines whose
    stripped length is below ``min_len`` are ignored. Every remaining
    word is passed through ``transform`` and stored in the result set.

    Args:
        path: Location of the word-list file.
        transform: Callable applied to each kept word before it is added
            to the set (identity by default). The ``min_len`` check is
            performed on the stripped word, before ``transform`` runs.
        label: Human-readable name of the list, used as the prefix of
            every log message emitted here.
        min_len: Minimum length of a stripped word for it to be kept.

    Returns:
        The set of transformed words. The set is empty when ``path``
        does not exist (a warning is logged). If an exception occurs
        while reading or processing, it is logged as an error and the
        entries collected up to that point are returned; nothing is
        raised to the caller.
    """
    result: set = set()
    # A missing file is not an error here: warn and hand back an empty set.
    if not path.exists():
        log.warning("%s introuvable : %s", label, path)
        return result
    try:
        for line in path.read_text(encoding="utf-8").splitlines():
            word = line.strip()
            # Skip blanks, "#" comment lines and words shorter than min_len.
            if word and not word.startswith("#") and len(word) >= min_len:
                result.add(transform(word))
        log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name)
    except Exception as exc:
        # Best-effort loading: log the failure and fall through to return
        # whatever was collected before the exception.
        log.error("%s : erreur de lecture %s — %s", label, path, exc)
    return result
 # ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) -----------------
 # Prénoms et noms de famille sont utilisés sous deux formes :
 # - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS"
@@ -199,62 +228,24 @@ _FINESS_ADDR_AC = None             # Automate Aho-Corasick pour adresses (noms d
 _VILLE_AC = None                   # Automate Aho-Corasick pour villes (INSEE + FINESS)
 # Communes trop ambiguës (homonymes de mots courants, trop courts, etc.)
-_VILLE_BLACKLIST = {
+_VILLE_BLACKLIST_FALLBACK = {
-    # Directions / mots géographiques génériques
+    "PARIS",
-    "SAINT", "NORD", "SUD", "EST", "OUEST",
+    "FRANCE",
-    "CENTRE", "SERVICE", "BOURG",
+    "EUROPE",
-    # Communes homonymes de mots courants français
+    "COURANT",
-    "ORANGE", "TOURS", "NICE", "SENS", "VITRE",
+    "COU",
-    "ROMANS", "MENTON", "SALON", "VIENNE",
+    "DOS",
-    "BREST",  # trop court et ambigu
+    "SEIN",
-    "HYERES",  # proche de termes médicaux
+    "BRAS",
    "AGEN", "AUCH", "ALBI",
    "BLOIS", "LAON", "LENS",
    "GIEN", "GRAY",
    "AIRE", "LURE", "SETE", "DOLE",
    "VIRE", "LUNEL", "MURET", "MORET",
    "COEUR", "FOIX", "GIVET",
    "EVIAN", "MAURE", "MENDE",
    "JOUE", "MEAUX", "REDON",
    "CREIL", "CERGY",
    # Communes de 4-5 lettres homonymes de mots très courants
    "VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
    "BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
    "VAUX", "VERT", "FAUX", "REZE",
    "BILLE", "PLACE", "VILLE", "COURS", "GRAND",
    "ROUGE", "RICHE", "NUITS", "SORE", "SARE",
    "TRANS", "RANS", "MARSA",
    # Mots courants français (6+ lettres) aussi communes
    "CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
    "MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
    "SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
    "PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
    # Prénoms très courants (aussi communes)
    "MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
    # Expressions composées ambiguës (aussi communes INSEE)
    "LONG", "RECY", "PLAN", "MARCHE", "SALLE",
    "CONTRE", "MERE", "ONDRES", "VEBRE",
    # Mots structurels / médicaux
    "PARIS",  # omniprésent, source de faux positifs
    "FRANCE", "EUROPE",
    # Termes ambigus (aussi communes INSEE) - trackare/DPI
    "COURANT",  # "Médecin courant" ≠ ville
    # Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)")
    "COU", "DOS", "SEIN", "BRAS",
 }
-# Enrichissement depuis fichier externe (modifiable sans toucher au code)
+_VILLE_BLACKLIST = _load_wordlist_file(
-_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt"
+    Path(__file__).parent / "data" / "villes_blacklist.txt",
-if _villes_bl_file.exists():
+    transform=str.upper,
-    try:
+    label="Villes blacklist",
-        for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines():
+)
-            _w = _line.strip()
+if not _VILLE_BLACKLIST:
-            if _w and not _w.startswith("#"):
+    _VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK)
-                _VILLE_BLACKLIST.add(_w)
+_BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST)
        log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST))
    except Exception as _exc:
        log.error("Villes blacklist : erreur de lecture %s — %s", _villes_bl_file, _exc)
 else:
    log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file)
 try:
    import ahocorasick as _ahocorasick
@@ -331,7 +322,7 @@ def load_medical_whitelists():
    global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
    # 1. Charger les termes médicaux structurels
-    config_path = Path("config/medical_terms_whitelist.yml")
+    config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml"
    if config_path.exists() and yaml:
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
@@ -345,48 +336,20 @@ def load_medical_whitelists():
    # 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
    _MEDICATION_WHITELIST = _load_edsnlp_drug_names()
    _MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
-    # Ajouter médicaments manquants
+    _MEDICATION_WHITELIST.update(
-    additional_meds = {
+        _load_wordlist_file(
-        "idacio", "salazopyrine", "infliximab", "apranax",
+            Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt",
-        "ketoprofene", "prevenar", "pneumovax", "bétadine"
+            transform=str.lower,
-    }
+            label="Whitelist médicaments manuelle",
-    _MEDICATION_WHITELIST.update(additional_meds)
+            min_len=3,
        )
    )
    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
 # Charger les whitelists au démarrage du module
 load_medical_whitelists()
 # ----------------- Defaults & Config -----------------
 DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
 }
 PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
@@ -445,408 +408,49 @@ def validate_nir(nir_raw: str) -> bool:
        return False
    return key_int == (97 - (body_int % 97))
-# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes
+# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes.
-_MEDICAL_STOP_WORDS_SET = {
+# Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp.
-    # Mots français courants (déterminants, prépositions, adverbes, etc.)
+_MEDICAL_STOP_WORDS_FALLBACK = {
-    "pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
+    "date",
-    "mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
+    "note",
-    "les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
+    "heure",
-    "cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
+    "type",
-    "puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
+    "traitement",
-    "mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
+    "traitements",
-    "semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
+    "soins",
-    "franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
+    "surveillance",
-    "date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
+    "consultation",
-    "signé", "réalisé", "courrier", "cabinet", "rue",
+    "hospitalisation",
    # Verbes / participes courants
    "remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
    "prescrit", "prescrite", "présente", "présent", "absente", "absent",
    "reprise", "introduction", "arrêt", "relais",
    # Titres / rôles hospitaliers
    "chef", "assistant", "assistante", "praticien", "praticienne",
    "docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
    "spécialiste", "contractuel", "contractuelle", "titulaire",
    "confrère", "consoeur", "coordonnateur", "coordonnatrice",
    "médecin", "médical", "infirmier", "infirmière",
    "praticiens", "patient", "patiente",
    # Structure hospitalière
    "service", "pôle", "clinique", "consultation", "secrétariat",
    "hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
    # Villes / géographie (pas des noms de personnes)
    "bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
    "toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
    "basque", "basques", "sud", "côte",
    # Médicaments génériques et spécialités (DCI + noms commerciaux)
    "colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
    "methotrexate", "eplerenone", "speciafoldine", "prednisone",
    "corticoïdes", "cortisone",
    "paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
    "lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
    "insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
    "ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
    "morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
    "seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
    "bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
    "quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
    "opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
    "laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
    "terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
    "lansoprazole", "perindopril", "sodium", "velmetia",
    "doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
    "augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
    "ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
    "irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
    "spironolactone", "furosemide", "lasilix", "aldactone",
    "tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
    "xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
    "plavix", "clopidogrel", "ticagrelor", "brilique",
    "ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
    "salbutamol", "tiotropium", "budesonide", "beclometasone",
    "oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
    "nubain", "nalbuphine", "nefopam", "acupan", "profenid",
    "ibuprofene", "diclofenac", "naproxene", "celecoxib",
    "gabapentine", "pregabaline", "lyrica", "neurontin",
    "amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
    "paroxetine", "escitalopram", "citalopram", "mirtazapine",
    "olanzapine", "risperidone", "aripiprazole", "haloperidol",
    "loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
    "clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
    "stilnox", "zolpidem", "imovane",
    "levothyroxine", "metformine", "glimepiride", "sitagliptine",
    "januvia", "jardiance", "empagliflozine", "dapagliflozine",
    "ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
    "heparine", "enoxaparine", "tinzaparine", "innohep",
    "warfarine", "coumadine", "fluindione", "previscan",
    "ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
    "vancomycine", "gentamicine", "tazocilline", "piperacilline",
    "meropenem", "imipenem", "clindamycine", "doxycycline",
    "azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
    "polyionique", "propranolol", "apidra", "solostar",
    # Noms et suffixes laboratoires pharmaceutiques
    "arw", "myl", "myp", "arg", "teva", "bga", "agt",
    "mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
    "accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
    "evolugen", "alter", "zydus", "medisol", "substipharm",
    "sdz", "bgr", "egt", "rnb",
    # Formes galéniques / voies d'administration
    "cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
    "flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
    "unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
    "orodisp", "capsule", "patch", "suppositoire", "gouttes",
    # Termes de prescription / pharmacie
    "prescription", "prescriptions", "dose", "fréquence", "statut",
    "technique", "capteur", "bandelettes", "glycemiques", "glycemique",
    "lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
    "glycemie", "capillaire", "hgt",
    # Termes médicaux / cliniques
    "myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
    "dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
    "antécédents", "examen", "bilan", "résultats", "analyse",
    "interne", "externe", "médecine", "chirurgie", "rhumatologie",
    "dermatologie", "immunologie", "cardiologie", "pneumologie",
    "neurologie", "gynécologie", "radiologie", "sénologie",
    "douleur", "douleurs", "douloureux", "musculaire", "musculaires",
    "thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
    "normale", "normaux", "habituelle", "habituelles",
    "synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
    "pelvien", "diabétique", "sommeil", "régime", "diet",
    "desinfection", "environnement", "identification", "bracelet",
    "toilettes", "accompagner", "installer", "transfusion",
    "signes", "vitaux", "alimentaire", "avis", "zone",
    "calcémie",
    # Abréviations médicales
    "irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
    "bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
    "saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
    "poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
    "qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
    "vit", "zen",
    "scanner", "radio", "écho", "échographie",
    # Spécialités médicales (éviter faux positifs NOM)
    "hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
    "proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
    "cardiologue", "néphrologue", "urologue", "gériatre",
    "hépatologue", "endocrinologue", "stomatologue",
    # Termes médicaux / titres fréquemment détectés comme NOM par le NER
    "supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
    "suivi", "sortie", "emog", "ophtalmo",
    # Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
    "eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
    "lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
    "depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
    "rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
    "pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
    # Termes médicaux / soins / actes détectés comme NOM
    "partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
    "diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
    "ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
    "diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
    # Mots courants français détectés comme NOM dans les trackare
    "toilette", "repas", "poche", "installation", "education", "éducation",
    "refection", "réfection", "complete", "complète", "regime", "régime",
    "normal", "traité", "traite", "arrêté", "arrete", "volume",
    "commentaires", "france", "covid", "framboise", "epoux", "époux",
    # Abréviations médicales courtes (3-4 chars) détectées comme NOM
    "ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
    "mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
    "amox", "endoc", "microg", "item", "pyélo", "néphro",
    # En-têtes de colonnes / mots structurels trackare
    "observations", "observation", "commentaires", "commentaire",
    "surveillance", "température", "temperature", "glycémie", "glycemie",
    "diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
    "saturation", "fréquence", "frequence", "respiratoire", "douleur",
    "alertes", "alerte", "antécédents", "antecedents", "habitus",
    "allergies", "prescriptions", "prescription", "administration",
    "catégorie", "categorie", "expiration", "message",
    "destination", "diagnostique", "diagnostiques",
    "date", "note", "nom", "heure", "type", "code", "etat",
    "comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
    # Médicaments supplémentaires détectés dans les trackare
    "depakote", "versatis", "humalog", "forxiga", "durogesic",
    "montelukast", "rosuvastatine",
    # Abréviations pharma courtes
    "cpr", "sol", "bic", "agt", "poche", "inhal",
    # Termes chirurgicaux/cliniques FP
    "cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
    "gauche", "droit", "droite", "face", "profil",
    # Faux positifs EDS supplémentaires
    "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
    "axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
    "10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
    "actiskenan", "simvastatine", "forlax",
    # Mots temporels / contextuels détectés comme EDS_HOPITAL
    "semaine", "jour", "matin", "soir", "nuit", "midi",
    # Mots clés de contexte document
    "compétences", "maladies", "inflammatoires", "systémiques", "rares",
    "fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
    "haute", "maison", "aide", "rpps", "poste", "fonct",
    "sante", "santé", "etxe", "ttipi", "gastro", "concha",
    "endoscopie", "endoscopique", "fibroscopie",
    "indication", "conclusion", "technique", "anesthésie",
    "digestif", "digestive", "digestives", "nutritive",
    # Abréviations soins trackare détectées comme NOM (batch 20 OGC)
    "soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
    # Verbes d'instructions soins (aussi des patronymes INSEE → FP)
    "coucher", "manger", "marcher", "sortir",
    "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
    # Mots narratifs CRH capturés par fusion sidebar 2-colonnes
    "evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
    "lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
    "paracetamol", "paracétamol", "unité", "unite",
    # FP résiduels batch 10 OGC (termes médicaux/instructions soins)
    "glyc", "glycosurie", "vider", "forte",
    # FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
    "oncologie", "confrères", "confrere", "doubles", "chers", "motif",
    "responsable", "autre", "autres", "autonome", "autonomes",
    "préparations", "preparations", "prévenir", "prevenir",
    "acétylsalicylique", "acetylsalicylique", "angio",
    "desc", "diu", "barreau",
    "haitz", "alde",
    # FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
    "alimentation", "augmentation", "amelioration", "amélioration",
    "biliaire", "biliaires", "bili", "voies", "voie",
    "apyrexie", "apyréxie", "apyrétique", "apyretique",
    "clavulanique", "mecillinam", "sulfamides", "sulfamide",
    "tazobactam", "temocilline", "ecoflac", "furanes", "furane",
    "exilar", "lipruzet", "mopral",
    "sensible", "sensibles", "dossier", "dossiers",
    "entero", "entéro", "medecine", "bio",
    "aviation", "contention", "isolement",
    "elimination", "élimination", "infectieux",
    "hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
    "cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
    "appendicectomie", "néoplasie", "neoplasie",
    "ovarienne", "prandial", "fébrile", "febrile",
    "eupnéique", "eupneique", "normocarde", "normotendue",
    "variable", "dosage", "posologie",
    # Abréviations diététiques/soins trackare
    "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
    # FP audit OGC 17 CRH
    "mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
    "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
    "saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
    # Spécialités/services récurrents comme FP NOM
    "cancérologie", "cancerologie", "réanimation", "reanimation",
    "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
    "gériatrie", "geriatrie", "pédiatrie", "pediatrie",
    "ophtalmologie", "stomatologie", "allergologie",
    "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
    "orthopédie", "orthopedie", "traumatologie",
    "palliatifs", "palliative", "palliatif",
    "addictologie", "alcoologie", "tabacologie",
    # FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
    "discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
    "evendol", "relais", "repas", "poursuite", "indication",
    # FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
    "eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
    "thermie", "alim", "alimentation", "admin",
    # Médicaments/tests labo capturés par patterns soignants
    "biprofenid", "bi-profenid", "phosphatase", "phosphatases",
    "ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
    "ciprofloxacine", "lavement", "desinfection", "désinfection",
    "avaler", "rachis", "lombaire", "thoraco-lombaire",
    "cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
    "thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
    # Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
    "faible", "fort", "forte",
    "myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
    "arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
    "abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
    "entree", "entrée", "continu", "continue",
    "morphine", "claforan", "skenan", "actiskenan",
    # Fragments de noms de médicaments (pdfplumber split)
    "sium", "pegic", "fenid", "profenid",
    # Catégories cliniques Trackare (en-têtes de section masqués à tort)
    "respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
    "hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
    "transit", "anemie", "anémie", "constantes", "examen",
    "post-op", "postop", "pré-op", "preop", "chimio", "elim",
    "toilette", "sommeil", "hypota", "hypotension", "spo2",
    "urine", "urines", "sng",
    "rénale", "renale", "rénal", "renal", "cardiaque",
    # Termes structurels trackare
    "transmissions", "transmission", "releve", "relevé",
    "objectif", "objectifs", "evaluation", "évaluation",
    "planification", "planifié", "planifiee",
    # ── FP détectés automatiquement par audit_fp_detector.py ──
    # Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
    "acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
    "bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
    "devenir", "diffusé", "douche", "entrée", "escarre", "espace",
    "explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
    "germes", "glace", "habillage", "liste", "maquillage", "matelas",
    "mettre", "obésité", "ongles", "palais", "perlant", "pertes",
    "pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
    "tenue", "texte", "transaminases", "transit", "transmis", "urinal",
    "vernis", "vessie", "vrac",
    # Lot 2 : termes médicaux (préfixes/suffixes)
    "anatomo-pathologique", "anemie", "anémie", "angioscanner",
    "cétonurie", "cetonurie", "depilation", "dépilation",
    "folique", "gastroentérologue", "gastroenterologue",
    "microgrammes", "nalidixique", "naso-gastrique",
    "angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
    "cyto", "plaie-colle", "bionolyte",
    # Lot 1 (103 tokens, confiance >= 0.5) ──
    # Anatomie / clinique
    "abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
    "intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
    "plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
    # Pathologies / symptômes
    "algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
    "hemodialyse", "hemorragique", "hyperthermie", "hématologue",
    # Médicaments / matériel médical
    "ampoule", "antalgique", "antiseptique", "compresse", "flacon",
    "oxygène", "pansement", "vitamine",
    # Biologie / examens
    "biochimie", "biologie", "fer",
    # Actions / états cliniques
    "ablation", "absence", "admission", "bloc", "changement", "cliniquement",
    "cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
    "intervention", "position", "rappel", "relation", "retour", "réalisation",
    "résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
    "urgent", "validation",
    # Mots courants / contextuels
    "angle", "bille", "boisson", "bureau", "cases", "circuit",
    "concubin", "confortable", "demain", "densité", "dernière",
    "distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
    "hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
    "personne", "premier", "quartier", "retraite", "route", "rés",
    "trouve", "verrouillé", "villa", "étage",
    # Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
    "ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
    "bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
    "traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
    "maternité", "orale", "sachet", "absence",
    # FP audit 30 fichiers Phase 2 (mars 2026)
    "bouffee", "bouffée", "discontinue", "respimat", "lyoc",
    "probnp", "pro-bnp", "nt-probnp",
    "bpco", "colle", "gsc", "masse",
    "selle", "selles",
    # Acronymes médicaux courts (3 lettres) souvent FP comme NOM
    "epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
    "imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
    "hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
    "vni", "aeg", "bas", "snv", "hba", "ide", "dci",
    # Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026)
    "buvable", "buvables", "nominal", "nominaux",
    "acide", "principale", "principal", "principaux",
    "hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
    "clavulanique", "nalidixique",
    "grancher",  # Centre de réadaptation (nom d'établissement homonyme)
    "experf",  # Prestataire HAD (nom commercial homonyme)
    # Noms de services hospitaliers (FP comme [NOM])
    "ortho", "mobile", "polyvalente", "polyvalent",
    "geriatrie", "gériatrie", "ambulatoire", "provisoire",
    "intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
    # Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents)
    "viscerale", "viscérale", "vasculaire", "vasculaires",
    "conventionnelle", "conventionnel",
    "polyvalente", "polyvalent",
    "infectieuse", "infectieuses",
    # Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216)
    "aide", "partielle", "partiel", "complete", "complète", "complet",
    "contention", "lavabo", "blader", "scan", "post", "lunettes",
    "deshabillage", "déshabillage", "habillage",
    "surveillance", "surv", "refection", "réfection",
    "miction", "toilette", "douche", "changes",
    "installation", "transfert", "mobilisation",
    "alimentation", "hydratation", "collation",
    "stimulation", "prevention", "prévention",
    # Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16)
    "chlorure",
    # Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
    "canule", "canules", "masque", "sonde", "sondes",
    # Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
    "totale", "total", "partielle", "partiel",
    "prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
    # Antiseptiques / produits de soins (FP trackare prescriptions)
    "betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
    # Nutrition entérale / compléments
    "fresubin", "nutrison", "sondalis", "isosource", "novasource",
    # Termes médicaux FP dans bactério / texte libre
    "nombreuses", "nombreux", "plusieurs", "quelques",
    "internationale", "international",
    "resorbable", "résorbable", "resorbables", "résorbables",
    "alfa", "capsule", "capsules",
 }
-# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
+_MEDICAL_STOP_WORDS_SET = _load_wordlist_file(
-_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
+    Path(__file__).parent / "data" / "stopwords_manuels.txt",
-
+    transform=str.lower,
-# Enrichissement depuis fichier externe (modifiable sans toucher au code)
+    label="Stop-words manuels",
 _stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt"
 if _stopwords_file.exists():
    try:
        _sw_count = 0
        for _line in _stopwords_file.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                _MEDICAL_STOP_WORDS_SET.add(_w)
                _sw_count += 1
        log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name)
    except Exception as _exc:
        log.error("Stop-words manuels : erreur de lecture %s — %s", _stopwords_file, _exc)
 else:
    log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file)
 # Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives
 _bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
 if _bdpm_path.exists():
    try:
        _bdpm_count = 0
        for _line in _bdpm_path.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                _MEDICAL_STOP_WORDS_SET.add(_w)
                _bdpm_count += 1
        log.info("BDPM stop-words chargés : %d mots", _bdpm_count)
    except Exception as _exc:
        log.error("BDPM stop-words : erreur de lecture %s — %s", _bdpm_path, _exc)
 else:
    log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path)
 _MEDICAL_STOP_WORDS = (
    r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
 )
 _MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
 _MEDICAL_STOP_WORDS_SET.update(
    _load_wordlist_file(
        Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt",
        transform=str.lower,
        label="BDPM stop-words",
    )
 )
 if not _MEDICAL_STOP_WORDS_SET:
    _MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK)
 _BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET)
 def _refresh_medical_stopwords_pattern() -> None:
    global _MEDICAL_STOP_WORDS
    if not _MEDICAL_STOP_WORDS_SET:
        _MEDICAL_STOP_WORDS = r"(?!)"
        return
    _MEDICAL_STOP_WORDS = (
        r"(?:" + "|".join(re.escape(w) for w in sorted(_MEDICAL_STOP_WORDS_SET)) + r")"
    )
 _refresh_medical_stopwords_pattern()
 # A single name token: starts with an uppercase letter, followed by one or more
 # letters, hyphens or apostrophes (NO space and NO period inside a token).
 _PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
 RE_PERSON_CONTEXT = re.compile(
@@ -985,7 +589,17 @@ RE_CIVILITE_INITIALE = re.compile(
 # --- N° examen / N° patient imagerie (radiologie) ---
 RE_NUM_EXAMEN_PATIENT = re.compile(
-    r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})",
+    r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*"
    r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)",
    re.IGNORECASE,
 )
 # --- Bare "N°" header line on lab / imaging reports ---
 # Example:
 #   N° 23L35781
 #   Prélevé le 26/07/2023     Enregistré le 27/07/2023
 # Group 1 captures the identifier (6-20 letters, digits or hyphens). The match
 # only succeeds when a "Prélevé le" / "Enregistré le" label follows the
 # identifier line, with at most two intervening lines (plus optional whitespace).
 RE_NUM_ACCESSION_HEADER = re.compile(
    r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n"
    r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)",
    re.IGNORECASE,
 )
@@ -1177,6 +791,7 @@ _DPI_LABELS_SET: set = _load_txt_set(
 )
 if not _DPI_LABELS_SET:
    _DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)
 _BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET)
 # Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
 # (spécialités, labos pharma, mots courants ambigus).
@@ -1189,6 +804,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set(
 )
 if not _COMPANION_BLACKLIST_SET:
    _COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)
 _BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET)
 _WHITELIST_FUNCTION_WORDS = {
@@ -1223,14 +839,15 @@ def _load_whitelist_phrases(phrases) -> int:
 def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
-    cfg = DEFAULTS_CFG.copy()
+    global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
-    if config_path and config_path.exists() and yaml is not None:
+    cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
-        try:
+
-            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
+    _MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
-            for k, v in user.items():
+    _VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
-                cfg[k] = v
+    _DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
-        except Exception:
+    _COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET)
-            pass
+    _WHITELIST_NEVER_MASK_TOKENS.clear()
    _WHITELIST_NEVER_MASK_PHRASES.clear()
    # Charger les stop-words et villes supplémentaires depuis le YAML
    extra_sw = cfg.get("additional_stopwords", [])
@@ -1239,6 +856,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
            if w and str(w).strip():
                _MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
        log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))
    _refresh_medical_stopwords_pattern()
    extra_villes = cfg.get("additional_villes_blacklist", [])
    if extra_villes:
@@ -1871,8 +1489,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
    return key
 def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str:
    start = full_match.find(captured_value)
    if start < 0:
        return placeholder
    end = start + len(captured_value)
    return full_match[:start] + placeholder + full_match[end:]
 def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask structured fields whose detection depends on the label of the line.

    Four patterns are applied in sequence: postal code, exam/patient number,
    record number and stay number. Every hit is appended to ``audit`` with
    ``page_idx``, and the captured value is replaced by the placeholder of its
    kind while the rest of the matched text is kept.

    Returns the masked line (identical to ``line`` when nothing matched).
    """

    def _log_hit(kind: str, value: str) -> str:
        # Record the hit and hand back the placeholder used for this kind.
        token = PLACEHOLDERS[kind]
        audit.append(PiiHit(page_idx, kind, value, token))
        return token

    def _on_postal_code(match: re.Match) -> str:
        first = match.group(1)
        token = _log_hit("CODE_POSTAL", first or match.group(2) or match.group(0))
        if first:
            return _replace_captured_value(match.group(0), first, token)
        # No first group: the whole match is replaced.
        return token

    def _on_exam_number(match: re.Match) -> str:
        value = match.group(1)
        token = _log_hit("DOSSIER", value)
        return _replace_captured_value(match.group(0), value, token)

    def _on_record_number(match: re.Match) -> str:
        value = match.group(1) or match.group(2) or match.group(0)
        token = _log_hit("DOSSIER", value)
        return _replace_captured_value(match.group(0), value, token)

    def _on_stay_number(match: re.Match) -> str:
        value = match.group(1)
        token = _log_hit("NDA", value)
        return _replace_captured_value(match.group(0), value, token)

    # Order matters: each pattern runs on the output of the previous one.
    steps = (
        (RE_CODE_POSTAL, _on_postal_code),
        (RE_NUM_EXAMEN_PATIENT, _on_exam_number),
        (RE_NUMERO_DOSSIER, _on_record_number),
        (RE_VENUE_SEJOUR, _on_stay_number),
    )
    masked = line
    for pattern, handler in steps:
        masked = pattern.sub(handler, masked)
    return masked
 def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    line = _mask_admin_label(line, audit, page_idx)
    structured_line = _mask_structured_line(line, audit, page_idx)
    if structured_line != line:
        return structured_line
    parts = SPLITTER.split(line, maxsplit=1)
    if len(parts) == 2:
        key, value = parts
@@ -2413,6 +2072,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
    for m in _RE_EMAIL_HEADER.finditer(full_text):
        _add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")
    # En-têtes patient en capitales, sans libellé explicite.
    # Exemple:
    #   ETCHEVERRY JEAN CLAUDE
    # On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom
    # INSEE et un nom de famille INSEE. Les tokens proposés viennent
    # exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
    _UPPER_NAME_LINE_RE = re.compile(
        r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+"
        r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$",
        re.MULTILINE,
    )
    for m in _UPPER_NAME_LINE_RE.finditer(full_text):
        raw_line = re.sub(r"\s+", " ", m.group(1)).strip()
        tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")]
        if len(tokens) < 2 or len(tokens) > 4:
            continue
        if any(len(tok) < 3 for tok in tokens):
            continue
        norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens]
        has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens)
        has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens)
        if not (has_prenom and has_nom):
            continue
        for tok, norm_tok in zip(tokens, norm_tokens):
            if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE:
                _add_candidate(tok, "UPPER_NAME_LINE", "low", False)
    # Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
    # ajouter aussi les parties individuelles pour capturer les occurrences standalone.
    # _apply_extracted_names traite le composé en premier (plus long) puis les parties.
@@ -2582,10 +2270,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
 def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
-    """Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.).
+    """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
    Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
    mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
-    _APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
+    _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
    # Collecter les valeurs à remplacer, groupées par placeholder
    replacements: Dict[str, str] = {}  # original → placeholder
    for h in audit:
@@ -2698,7 +2386,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
    for m in _RE_IPP_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
-    # Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
+    # Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie
    # Ex:
    #   N° 23L35781
    #   Prélevé le 26/07/2023
    for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw):
        audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
    # Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164)
    _RE_DEMANDE_MULTILINE = re.compile(
        r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
        re.IGNORECASE,
@@ -2706,14 +2401,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
    for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
-    # Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
+    # Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
    _RE_VENUE_MULTILINE = re.compile(
        r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
        re.IGNORECASE,
    )
    for m in _RE_VENUE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
-    # Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
+    # Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
    _RE_VENUE_REVERSE = re.compile(
        r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
        re.IGNORECASE | re.MULTILINE,
@@ -3092,55 +2787,17 @@ def _build_finess_ac():
        return
    # Mots génériques qui ne doivent jamais être matchés seuls
-    _ac_generic_blacklist = {
+    _ac_generic_blacklist = _load_wordlist_file(
-        # Types d'établissements
+        data_dir / "generic_name_blacklist.txt",
-        "clinique", "pharmacie", "hopital", "centre", "foyer",
+        transform=str.lower,
-        "residence", "maison", "cabinet", "service", "laboratoire",
+        label="FINESS noms génériques blacklist",
-        "institut", "association", "fondation", "mutuelle", "polyclinique",
+    )
        "dispensaire", "hospice", "annexe", "antenne", "site",
        # Mots français courants qui sont aussi des noms d'établissements
        "collegiale", "collegial", "cathedral", "cathedrale",
        "providence", "esperance", "renaissance", "liberation",
        "republique", "fraternite", "solidarite", "independance",
        "beauregard", "bellevue", "belvedere",
        "promenade", "esplanade", "corniche", "prefecture",
        "croissant", "confluence", "bienvenue",
        "chartreuse", "commanderie", "chapelle", "basilique",
        "departement", "departementale", "communautaire",
        # Spécialités médicales / termes cliniques courants
        "chirurgicale", "radiologie", "addictologie", "prevention",
        "psychotherapique", "ambulatoire", "hospitalisation",
        "consultation", "surveillance", "therapeutique",
        "readaptation", "reeducation", "reanimation",
        "specialisee", "conventionnelle", "professionnelle",
        "informatique", "administrative", "regionale",
        # Mots communs
        "generation", "revolution", "assomption", "visitation",
        "consolation", "atlantique", "manutention", "prefiguration",
        "intervalle", "pharmaciens", "pharmacien", "transfert",
        "comprimee", "comprimees", "injectable", "injectables",
        "maintenant", "actuellement", "auparavant", "prochainement",
        "rapidement", "correctement", "directement", "simplement",
        "internationale", "international", "intercommunal", "intercommunale",
        # Termes médicaux homonymes d'établissements FINESS (retour relecteur 2026-03-17)
        "resistance", "radiotherapie", "chimiotherapie", "curietherapie",
        "hormonotherapie", "immunotherapie", "kinesitherapie",
        "ergotherapie", "orthophonie", "psychomotricite",
        "reeducation", "readaptation", "convalescence",
        "dependance", "autonomie", "gerontologie",
    }
    # Expressions multi-mots trop génériques
-    _ac_generic_phrases = {
+    _ac_generic_phrases = _load_wordlist_file(
-        "a domicile", "au domicile", "menage a domicile",
+        data_dir / "generic_phrase_blacklist.txt",
-        "du nord", "du sud", "de l est", "de l ouest",
+        transform=str.lower,
-        "la maison", "la residence", "les jardins",
+        label="FINESS expressions génériques blacklist",
-        "le village", "le parc", "la colline",
+    )
        "au soleil", "en france",
        # Expressions médicales homonymes d'établissements FINESS (FP relecteur 2026-03-16)
        "long cours", "au long cours",
        "le bourg", "le val", "le clos", "le mas",
        "les pins", "les chenes", "les oliviers",
    }
    # Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs
    # (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars).
    # Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle.
@@ -3365,8 +3022,11 @@ def _build_finess_addr_ac():
                       "sentier", "rond-point", "traverse", "esplanade",
                       "promenade", "montee", "voie", "carrefour", "faubourg"}
        # Patterns non-adresse à exclure
-        _addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier",
+        _addr_blacklist = _load_wordlist_file(
-                          "cabinet paramedical", "cabinet sage-femme"}
+            data_dir / "address_blacklist.txt",
            transform=str.lower,
            label="FINESS adresses blacklist",
        )
        for line in addr_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            if not name or len(name) < 10:
@@ -3804,11 +3464,19 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
-    protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
+    def _rescan_code_postal(m: re.Match) -> str:
        if m.group(1):
            return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"])
        return PLACEHOLDERS["CODE_POSTAL"]
    protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected)
    # N° Episode
    protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
    # N° venue / séjour
-    protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
+    protected = RE_VENUE_SEJOUR.sub(
        lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]),
        protected,
    )
    # N° RPPS
    protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
    # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
@@ -4825,7 +4493,7 @@ if __name__ == "__main__":
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
-    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
+    ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
    ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    args = ap.parse_args()
--- a/build_windows.bat
+++ b/build_windows.bat
@@ -33,6 +33,7 @@ python -m nuitka ^
    --include-module=ner_manager_onnx ^
    --include-module=eds_pseudo_manager ^
    --include-data-dir=config=config ^
    --include-data-dir=data=data ^
    --include-data-dir=models=models ^
    --nofollow-import-to=onnxruntime ^
    --nofollow-import-to=numpy ^
--- a/config/dictionnaires.default.yml
+++ b/config/dictionnaires.default.yml
@@ -0,0 +1,59 @@
 # Template versionné des règles d'anonymisation.
 # Ce fichier décrit les valeurs par défaut complètes de l'application.
 # La surcharge locale chargée par défaut est config/dictionnaires.yml.
 version: 1
 encoding: utf-8
 normalization: NFKC
 whitelist:
  sections_titres:
  - DIM
  - GHM
  - GHS
  - RUM
  - COMPTE
  - RENDU
  - DIAGNOSTIC
  noms_maj_excepts:
  - Médecin DIM
  - Praticien conseil
  org_gpe_keep: false
 blacklist:
  # Sigles et libellés propres à l'établissement non couverts par les gazetteers
  # nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
  # villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
  force_mask_terms:
  - CHCB
  - 'Dates du séjour :'
  - CONCERTATION
  - LABORATOIRE de BIOLOGIE MEDICALE
  force_mask_regex:
  - '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
 kv_labels_preserve:
 - FINESS
 - IPP
 - N° OGC
 - Etablissement
 regex_overrides:
 - name: OGC_court
  pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
  placeholder: '[OGC]'
  flags:
  - IGNORECASE
 whitelist_phrases:
  - "classification internationale"
  - "prise en charge"
  - "bas de contention"
  - "date de naissance"
  - "lieu de naissance"
  - "ville de résidence"
  - "date de sortie"
  - "date d'admission"
  - "code postal"
 additional_stopwords: []
 additional_villes_blacklist: []
 additional_dpi_labels: []
 additional_companion_blacklist: []
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: python
--- a/config/dictionnaires.yml
+++ b/config/dictionnaires.yml
@@ -1,83 +1,11 @@
-version: 1
+# Surcharge locale chargée par défaut par l'application.
-encoding: utf-8
+# Source de vérité des valeurs par défaut : config/dictionnaires.default.yml
-normalization: NFKC
+# Ce fichier ne doit contenir que les écarts spécifiques à l'environnement courant.
-whitelist:
+#
-  sections_titres:
+# Exemples :
-  - DIM
+# blacklist:
-  - GHM
+#   force_mask_terms:
-  - GHS
+#   - VOTRE_SIGLE
-  - RUM
+# additional_stopwords:
-  - COMPTE
+# - votre_terme
-  - RENDU
+{}
  - DIAGNOSTIC
  noms_maj_excepts:
  - Médecin DIM
  - Praticien conseil
  org_gpe_keep: false
 blacklist:
  # Sigles et libellés propres à l'établissement non couverts par les gazetteers
  # nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
  # villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
  force_mask_terms:
  - CHCB                                # Sigle local non référencé FINESS
  - 'Dates du séjour :'                 # Libellé administratif (politique masquage)
  - CONCERTATION                        # Mention de RCP (politique métier)
  - LABORATOIRE de BIOLOGIE MEDICALE    # Libellé administratif générique
  force_mask_regex:
  # Adresse précise du CHCB — couverte par l'AC FINESS adresses mais on garde
  # la regex en filet de sécurité (encodages PDF, espaces non standards).
  - '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
 kv_labels_preserve:
 - FINESS
 - IPP
 - N° OGC
 - Etablissement
 regex_overrides:
 - name: OGC_court
  pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
  placeholder: '[OGC]'
  flags:
  - IGNORECASE
 # Phrases à ne JAMAIS anonymiser (faux positifs récurrents)
 # Ajouter ici les expressions qui sont masquées à tort.
 # La correspondance est insensible à la casse.
 whitelist_phrases:
  - "classification internationale"
  - "prise en charge"
  - "bas de contention"
  - "date de naissance"
  - "lieu de naissance"
  - "ville de résidence"
  - "date de sortie"
  - "date d'admission"
  - "code postal"
 # Mots supplémentaires à ne jamais masquer comme noms de personnes
 # (complète les 9000+ stop-words intégrés)
 additional_stopwords: []
 # Exemple :
 #   - "votre_mot"
 # Villes supplémentaires à ne jamais matcher comme lieux
 # (complète les 115+ villes blacklistées intégrées)
 additional_villes_blacklist: []
 # Exemple :
 #   - "VOTRE_VILLE"
 # Labels DPI supplémentaires à ne jamais masquer comme noms
 # (complète data/dpi_labels_blacklist.txt)
 # Utiliser pour : titres de colonnes, en-têtes de sections, libellés de champs
 additional_dpi_labels: []
 # Exemple :
 #   - "Service"
 #   - "Statut"
 # Termes en MAJUSCULES à ne jamais propager comme noms compagnons
 # (complète data/companion_blacklist.txt — spécialités, labos pharma, mots ambigus)
 additional_companion_blacklist: []
 # Exemple :
 #   - "VOTRE_SPECIALITE"
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: python
--- a/config_defaults.py
+++ b/config_defaults.py
@@ -0,0 +1,177 @@
 #!/usr/bin/env python3
 """
 Shared helpers for the dictionaries configuration.
 """
 from __future__ import annotations
 from copy import deepcopy
 from pathlib import Path
 from typing import Any, Dict
 # PyYAML is optional: if the import fails for any reason, ``yaml`` is set to
 # None and the loaders below fall back to built-in values or an empty overlay.
 try:
    import yaml
 except Exception:
    yaml = None
 # Directory containing this module; the config paths below are derived from it.
 PROJECT_DIR = Path(__file__).resolve().parent
 CONFIG_DIR = PROJECT_DIR / "config"
 # Default template file and the local runtime overlay file.
 DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.default.yml"
 RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.yml"
 # Initial content written to a missing runtime overlay file: explanatory
 # comments followed by an empty YAML mapping ("{}"), i.e. no override at all.
 _RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application.
 # Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici.
 # Si ce fichier est vide, les valeurs du template par défaut s'appliquent.
 #
 # Exemples :
 # blacklist:
 #   force_mask_terms:
 #   - VOTRE_SIGLE
 # additional_stopwords:
 # - votre_terme
 {}
 """
 # Embedded YAML text returned by read_default_dictionaries_text() when the
 # default config file cannot be read. Backslashes are doubled because this is
 # a regular (non-raw) Python string; the resulting YAML contains single ones.
 _FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1
 encoding: utf-8
 normalization: NFKC
 whitelist:
  sections_titres:
  - DIM
  - GHM
  - GHS
  - RUM
  - COMPTE
  - RENDU
  - DIAGNOSTIC
  noms_maj_excepts:
  - Médecin DIM
  - Praticien conseil
  org_gpe_keep: false
 blacklist:
  force_mask_terms: []
  force_mask_regex: []
 kv_labels_preserve:
 - FINESS
 - IPP
 - N° OGC
 - Etablissement
 regex_overrides:
 - name: OGC_court
  pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b
  placeholder: '[OGC]'
  flags:
  - IGNORECASE
 whitelist_phrases: []
 additional_stopwords: []
 additional_villes_blacklist: []
 additional_dpi_labels: []
 additional_companion_blacklist: []
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: python
 """
 # Dict form of the fallback defaults. load_default_dictionaries_dict() returns
 # a deep copy of it when PyYAML is unavailable, parsing raises, or the parsed
 # document is not a mapping. Keep it in sync with the fallback YAML text.
 _FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "whitelist_phrases": [],
    "additional_stopwords": [],
    "additional_villes_blacklist": [],
    "additional_dpi_labels": [],
    "additional_companion_blacklist": [],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
 }
 def read_default_dictionaries_text() -> str:
    """Return the default dictionaries configuration as YAML text.

    The file at ``DEFAULT_DICTIONARIES_CONFIG_PATH`` is read as UTF-8. If the
    read raises for any reason, the embedded fallback text is returned
    instead, so this function never propagates a read error.
    """
    try:
        content = DEFAULT_DICTIONARIES_CONFIG_PATH.read_text(encoding="utf-8")
    except Exception:
        content = _FALLBACK_DEFAULT_DICTIONARIES_TEXT
    return content
 def read_runtime_dictionaries_overlay_text() -> str:
    """Return the template text for a fresh runtime overlay file."""
    overlay_template = _RUNTIME_DICTIONARIES_OVERLAY_TEXT
    return overlay_template
 def load_default_dictionaries_dict() -> Dict[str, Any]:
    """Return the default dictionaries configuration as a dict.

    The default YAML text is parsed with ``yaml.safe_load`` when PyYAML is
    available; an empty document yields ``{}``. A deep copy of the built-in
    fallback dict is returned when PyYAML is missing, parsing raises, or the
    parsed document is not a mapping.
    """
    raw_yaml = read_default_dictionaries_text()
    parsed = None
    if yaml is not None:
        try:
            parsed = yaml.safe_load(raw_yaml) or {}
        except Exception:
            # Best effort: any parse failure falls through to the fallback.
            parsed = None
    if isinstance(parsed, dict):
        return parsed
    return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT)
 def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the runtime overlay as a dict, or ``{}`` when unusable.

    Args:
        path: Overlay file to read; defaults to
            ``RUNTIME_DICTIONARIES_CONFIG_PATH`` when ``None``.

    Returns:
        The parsed mapping. An empty dict is returned when the file does not
        exist, PyYAML is unavailable, reading or parsing raises, or the
        document is not a mapping.
    """
    overlay_file = RUNTIME_DICTIONARIES_CONFIG_PATH if path is None else Path(path)
    # Existence is checked first so a missing file never touches the parser.
    if not overlay_file.exists() or yaml is None:
        return {}
    try:
        parsed = yaml.safe_load(overlay_file.read_text(encoding="utf-8")) or {}
    except Exception:
        # Best effort: an unreadable or malformed overlay means "no override".
        return {}
    return parsed if isinstance(parsed, dict) else {}
 def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the defaults merged with the runtime overlay.

    Args:
        path: Optional overlay file, forwarded to
            ``load_runtime_dictionaries_overlay_dict``.

    Returns:
        The result of ``deep_merge_dict(defaults, overlay)``.
    """
    defaults = load_default_dictionaries_dict()
    overlay = load_runtime_dictionaries_overlay_dict(path)
    return deep_merge_dict(defaults, overlay)
 def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict made of ``base`` recursively updated by ``override``.

    Merge rules, applied per key of ``override``:

    - dict over dict: merged recursively;
    - list over list: override items not already present are appended after
      the base items (membership uses ``in``, so unhashable items such as
      dicts are supported);
    - ``None`` over a dict or list: ignored. A YAML key left without a value
      (for example ``blacklist:`` whose children are all commented out)
      parses to ``None``; letting it replace a whole default section would
      silently drop the default rules;
    - anything else: the override value replaces the base value.

    Neither argument is mutated; all retained values are deep-copied.

    Args:
        base: Default values. ``None`` is treated as an empty dict.
        override: Values layered on top. ``None`` is treated as an empty dict.

    Returns:
        The merged dictionary.
    """
    merged = deepcopy(base or {})
    for key, value in (override or {}).items():
        current = merged.get(key)
        if value is None and isinstance(current, (dict, list)):
            # Empty overlay key: keep the structured default untouched.
            continue
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = deep_merge_dict(current, value)
        elif isinstance(value, list) and isinstance(current, list):
            # ``current`` already belongs to the copy, so extend it in place.
            for item in value:
                if item not in current:
                    current.append(deepcopy(item))
        else:
            merged[key] = deepcopy(value)
    return merged
 def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path:
    """Make sure the runtime overlay file exists and return its path.

    Args:
        path: Target file; defaults to ``RUNTIME_DICTIONARIES_CONFIG_PATH``
            when ``None``.

    Returns:
        The target path. An existing target is left untouched; a missing one
        is created (parent directories included) with the overlay template
        text, encoded as UTF-8.
    """
    target = RUNTIME_DICTIONARIES_CONFIG_PATH if path is None else Path(path)
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
    return target
--- a/data/bdpm/medication_whitelist_manual.txt
+++ b/data/bdpm/medication_whitelist_manual.txt
@@ -0,0 +1,11 @@
 # Compléments manuels à la whitelist médicaments.
 # Un terme par ligne, en lowercase.
 idacio
 salazopyrine
 infliximab
 apranax
 ketoprofene
 prevenar
 pneumovax
 bétadine
--- a/data/finess/address_blacklist.txt
+++ b/data/finess/address_blacklist.txt
@@ -0,0 +1,7 @@
 # Faux positifs à exclure du gazetteer d'adresses FINESS.
 cabinet medical
 cabinet dentaire
 cabinet infirmier
 cabinet paramedical
 cabinet sage-femme
--- a/data/finess/generic_name_blacklist.txt
+++ b/data/finess/generic_name_blacklist.txt
@@ -0,0 +1,112 @@
 # Noms d'établissements trop génériques à ignorer dans l'automate FINESS.
 clinique
 pharmacie
 hopital
 centre
 foyer
 residence
 maison
 cabinet
 service
 laboratoire
 institut
 association
 fondation
 mutuelle
 polyclinique
 dispensaire
 hospice
 annexe
 antenne
 site
 collegiale
 collegial
 cathedral
 cathedrale
 providence
 esperance
 renaissance
 liberation
 republique
 fraternite
 solidarite
 independance
 beauregard
 bellevue
 belvedere
 promenade
 esplanade
 corniche
 prefecture
 croissant
 confluence
 bienvenue
 chartreuse
 commanderie
 chapelle
 basilique
 departement
 departementale
 communautaire
 chirurgicale
 radiologie
 addictologie
 prevention
 psychotherapique
 ambulatoire
 hospitalisation
 consultation
 surveillance
 therapeutique
 readaptation
 reeducation
 reanimation
 specialisee
 conventionnelle
 professionnelle
 informatique
 administrative
 regionale
 generation
 revolution
 assomption
 visitation
 consolation
 atlantique
 manutention
 prefiguration
 intervalle
 pharmaciens
 pharmacien
 transfert
 comprimee
 comprimees
 injectable
 injectables
 maintenant
 actuellement
 auparavant
 prochainement
 rapidement
 correctement
 directement
 simplement
 internationale
 international
 intercommunal
 intercommunale
 resistance
 radiotherapie
 chimiotherapie
 curietherapie
 hormonotherapie
 immunotherapie
 kinesitherapie
 ergotherapie
 orthophonie
 psychomotricite
 convalescence
 dependance
 autonomie
 gerontologie
--- a/data/finess/generic_phrase_blacklist.txt
+++ b/data/finess/generic_phrase_blacklist.txt
@@ -0,0 +1,26 @@
 # Expressions FINESS multi-mots trop génériques à ignorer.
 a domicile
 au domicile
 menage a domicile
 du nord
 du sud
 de l est
 de l ouest
 la maison
 la residence
 les jardins
 le village
 le parc
 la colline
 au soleil
 en france
 long cours
 au long cours
 le bourg
 le val
 le clos
 le mas
 les pins
 les chenes
 les oliviers
--- a/pseudonymisation_pipeline_gui_v3.py
+++ b/pseudonymisation_pipeline_gui_v3.py
@@ -37,33 +37,18 @@ try:
 except Exception:
    yaml = None
-APP_TITLE = "Pseudonymisation de PDF"
+from config_defaults import (
-DEFAULT_CFG = Path("config/dictionnaires.yml")
+    RUNTIME_DICTIONARIES_CONFIG_PATH,
    read_default_dictionaries_text,
    read_runtime_dictionaries_overlay_text,
 )
-# YAML par défaut (patterns en bloc littéral pour éviter les échappements)
+APP_TITLE = "Pseudonymisation de PDF"
-DEFAULTS_CFG_TEXT = """# dictionnaires.yml – valeurs par défaut
+DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
-version: 1
+
-encoding: "utf-8"
+# YAML par défaut externalisé dans config/dictionnaires.default.yml
-normalization: "NFKC"
+DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
-whitelist:
+RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
  sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
  noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
  org_gpe_keep: true
 blacklist:
  force_mask_terms: []
  force_mask_regex: []
 kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
 regex_overrides:
  - name: OGC_court
    pattern: |-
      \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
    placeholder: '[OGC]'
    flags: [IGNORECASE]
 flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: "python"
 """
 # ---------- util : ToolTip & helpers ----------
 class ToolTip:
@@ -211,7 +196,7 @@ class App:
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
-            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
+            p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
    def _cfg_browse(self):
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
@@ -248,7 +233,7 @@ class App:
            return
        try:
            with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
-                yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False)
+                yaml.safe_dump(self.cfg_data or {}, f, allow_unicode=True, sort_keys=False)
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
@@ -258,8 +243,8 @@ class App:
    def _restore_defaults(self):
        try:
-            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
+            Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
-            self._log("Règles restaurées aux valeurs par défaut.")
+            self._log("Surcharge locale réinitialisée.")
            self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
--- a/run_batch_30_audit.py
+++ b/run_batch_30_audit.py
@@ -9,6 +9,7 @@ from collections import Counter
 sys.path.insert(0, str(Path(__file__).parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from eds_pseudo_manager import EdsPseudoManager
 from vlm_manager import VlmManager
 from gliner_manager import GlinerManager
@@ -16,7 +17,7 @@ from camembert_ner_manager import CamembertNerManager
 SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
 OUTDIR = SRC / "anonymise_audit_30"
-CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
+CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
 PDFS = [
    SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
--- a/run_batch_59ogc.py
+++ b/run_batch_59ogc.py
@@ -9,11 +9,12 @@ from collections import Counter
 sys.path.insert(0, str(Path(__file__).parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from eds_pseudo_manager import EdsPseudoManager
 SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
 OUTDIR = SRC / "anonymise"
-CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
+CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
 def main():
    # Charger EDS-Pseudo
--- a/run_batch_silver_export.py
+++ b/run_batch_silver_export.py
@@ -19,9 +19,11 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
 OUTDIR = SRC / "anonymise_silver_extra"
-CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
+CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
 # PDFs déjà traités dans l'audit 30 (à exclure)
 ALREADY_DONE_AUDIT30 = {
--- a/scripts/merge_params.py
+++ b/scripts/merge_params.py
@@ -13,13 +13,18 @@ import json
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 try:
    import yaml
 except ImportError:
    print("ERREUR : pyyaml requis (pip install pyyaml)")
    sys.exit(1)
-CONFIG = Path(__file__).parent.parent / "config" / "dictionnaires.yml"
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
 def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False):
--- a/server.py
+++ b/server.py
@@ -29,6 +29,8 @@ from typing import Optional
 from fastapi import FastAPI, File, Form, UploadFile
 from fastapi.responses import JSONResponse
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
@@ -86,7 +88,7 @@ def _load_models():
    """Charge tous les modèles NER une seule fois au démarrage."""
    global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg
-    _cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml")
+    _cfg = load_dictionaries(RUNTIME_DICTIONARIES_CONFIG_PATH)
    # EDS-Pseudo (F1=0.97)
    if EdsPseudoManager is not None:
@@ -288,7 +290,7 @@ async def anonymize_pdf(
            out_dir=out_dir,
            make_vector_redaction=vector_redaction,
            also_make_raster_burn=raster_redaction,
-            config_path=Path(__file__).parent / "config" / "dictionnaires.yml",
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=use_ner and ner_mgr is not None,
            ner_manager=ner_mgr,
            gliner_manager=_gliner_manager if use_ner else None,
--- a/test_gui_error.py
+++ b/test_gui_error.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 # Tester avec un seul PDF
 test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf")
@@ -16,7 +17,7 @@ if test_pdf:
            Path("/tmp/test_gui"),
            make_vector_redaction=False,
            also_make_raster_burn=True,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
        )
        print(f"✅ Succès: {result}")
--- a/test_gui_fixed.py
+++ b/test_gui_fixed.py
@@ -6,6 +6,7 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 # Test avec un PDF simple
 test_pdf = Path("/tmp/test_gui_pdfs")
@@ -31,7 +32,7 @@ try:
        out_dir=out_dir,
        make_vector_redaction=False,
        also_make_raster_burn=True,
-        config_path=Path("config/dictionnaires.yml"),
+        config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
        use_hf=False,
        ner_manager=None,
        ner_thresholds=None,
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,12 @@
 #!/usr/bin/env python3
 """
 Shared pytest configuration for repository imports.
 """
 import sys
 from pathlib import Path
 # Two directory levels above this file, i.e. the parent of the tests/ folder.
 ROOT_DIR = Path(__file__).resolve().parent.parent
 # Put that directory first on sys.path (only once) so that modules located
 # there can be imported by the tests.
 if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
--- a/tests/synthetic_regression/README.md
+++ b/tests/synthetic_regression/README.md
@@ -0,0 +1,26 @@
 # Tests synthétiques de non-régression
 Cette suite fournit 10 cas synthétiques courts, relisibles et diffables, pensés
 comme première barrière de sécurité avant la revue humaine.
 Principe :
 - `test.txt` contient le document synthétique d'entrée à relire ou diff-er.
 - `expected.txt` contient la sortie anonymisée attendue, normalisée.
 - `expected.audit.json` contient un résumé stable de l'audit attendu.
 - `config_overlay.yml` est optionnel et permet de tester une surcharge locale.
 Objectif :
 - bloquer les régressions évidentes sur les règles critiques ;
 - rendre les écarts lisibles dans un diff Git ou dans la sortie de `pytest` ;
 - compléter, et non remplacer, la validation humaine sur corpus réel.
 Portée de cette première version :
 - texte uniquement ;
 - pas encore de PDF/OCR/layout ;
 - pas encore de cas `xfail` pour les bugs connus.
 Exécution :
 ```bash
 pytest -q tests/unit/test_synthetic_regression.py
 ```
--- a/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json
+++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.audit.json
@@ -0,0 +1,22 @@
 [
  {
    "kind": "DATE_NAISSANCE",
    "original": "Né le 12/03/1980",
    "replacement": "[DATE_NAISSANCE]"
  },
  {
    "kind": "NOM_GLOBAL",
    "original": "ETCHEVERRY",
    "replacement": "[NOM]"
  },
  {
    "kind": "NOM_GLOBAL",
    "original": "CLAUDE",
    "replacement": "[NOM]"
  },
  {
    "kind": "NOM_GLOBAL",
    "original": "JEAN",
    "replacement": "[NOM]"
  }
 ]
--- a/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt
+++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt
@@ -0,0 +1,3 @@
 [NOM] [NOM] [NOM]
 [DATE_NAISSANCE]
 Consultation du 14/03/2024
--- a/tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt
+++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/input.txt
@@ -0,0 +1,3 @@
 ETCHEVERRY JEAN CLAUDE
 Né le 12/03/1980
 Consultation du 14/03/2024
--- a/tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt
+++ b/tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt
@@ -0,0 +1,3 @@
 ETCHEVERRY JEAN CLAUDE
 Né le 12/03/1980
 Consultation du 14/03/2024
--- a/tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json
+++ b/tests/synthetic_regression/cases/002_contact_bundle/expected.audit.json
@@ -0,0 +1,12 @@
 [
  {
    "kind": "EMAIL",
    "original": "jean.dupont@example.com",
    "replacement": "[EMAIL]"
  },
  {
    "kind": "TEL",
    "original": "01 23 45 67 89",
    "replacement": "[TEL]"
  }
 ]
--- a/tests/synthetic_regression/cases/002_contact_bundle/expected.txt
+++ b/tests/synthetic_regression/cases/002_contact_bundle/expected.txt
@@ -0,0 +1 @@
 Contact : [EMAIL] ou [TEL]
--- a/tests/synthetic_regression/cases/002_contact_bundle/input.txt
+++ b/tests/synthetic_regression/cases/002_contact_bundle/input.txt
@@ -0,0 +1 @@
 Contact: jean.dupont@example.com ou 01 23 45 67 89
--- a/tests/synthetic_regression/cases/002_contact_bundle/test.txt
+++ b/tests/synthetic_regression/cases/002_contact_bundle/test.txt
@@ -0,0 +1 @@
 Contact: jean.dupont@example.com ou 01 23 45 67 89
--- a/tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json
+++ b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.audit.json
@@ -0,0 +1,7 @@
 [
  {
    "kind": "NDA",
    "original": "1234567",
    "replacement": "[NDA]"
  }
 ]
--- a/tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt
+++ b/tests/synthetic_regression/cases/003_multiline_venue_number/expected.txt
@@ -0,0 +1,3 @@
 N° venue :
 [NDA]
 Date de séjour : 14/03/2024
--- a/tests/synthetic_regression/cases/003_multiline_venue_number/input.txt
+++ b/tests/synthetic_regression/cases/003_multiline_venue_number/input.txt
@@ -0,0 +1,3 @@
 N° venue :
 1234567
 Date de séjour : 14/03/2024
--- a/tests/synthetic_regression/cases/003_multiline_venue_number/test.txt
+++ b/tests/synthetic_regression/cases/003_multiline_venue_number/test.txt
@@ -0,0 +1,3 @@
 N° venue :
 1234567
 Date de séjour : 14/03/2024
--- a/tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json
+++ b/tests/synthetic_regression/cases/004_identifier_bundle/expected.audit.json
@@ -0,0 +1,27 @@
 [
  {
    "kind": "RPPS",
    "original": "12345678901",
    "replacement": "[RPPS]"
  },
  {
    "kind": "FINESS",
    "original": "123456789",
    "replacement": "[FINESS]"
  },
  {
    "kind": "IPP",
    "original": "ABC12345",
    "replacement": "[IPP]"
  },
  {
    "kind": "OGC",
    "original": "12",
    "replacement": "[OGC]"
  },
  {
    "kind": "IBAN",
    "original": "FR76 3000 6000 0112 3456 7890 189",
    "replacement": "[IBAN]"
  }
 ]
--- a/tests/synthetic_regression/cases/004_identifier_bundle/expected.txt
+++ b/tests/synthetic_regression/cases/004_identifier_bundle/expected.txt
@@ -0,0 +1,5 @@
 RPPS : [RPPS]
 FINESS : [FINESS]
 IPP : [IPP]
 N° OGC : [OGC]
 IBAN : [IBAN]
--- a/tests/synthetic_regression/cases/004_identifier_bundle/input.txt
+++ b/tests/synthetic_regression/cases/004_identifier_bundle/input.txt
@@ -0,0 +1,5 @@
 RPPS : 12345678901
 FINESS : 123456789
 IPP : ABC12345
 N° OGC : 12
 IBAN : FR76 3000 6000 0112 3456 7890 189
--- a/tests/synthetic_regression/cases/004_identifier_bundle/test.txt
+++ b/tests/synthetic_regression/cases/004_identifier_bundle/test.txt
@@ -0,0 +1,5 @@
 RPPS : 12345678901
 FINESS : 123456789
 IPP : ABC12345
 N° OGC : 12
 IBAN : FR76 3000 6000 0112 3456 7890 189
--- a/tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json
+++ b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.audit.json
@@ -0,0 +1,7 @@
 [
  {
    "kind": "force_term",
    "original": "CHCB",
    "replacement": "[MASK]"
  }
 ]
--- a/tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt
+++ b/tests/synthetic_regression/cases/005_force_mask_default_term/expected.txt
@@ -0,0 +1 @@
 Patient adressé au [MASK] pour avis. Retour au [MASK] demain.
--- a/tests/synthetic_regression/cases/005_force_mask_default_term/input.txt
+++ b/tests/synthetic_regression/cases/005_force_mask_default_term/input.txt
@@ -0,0 +1 @@
 Patient adressé au CHCB pour avis. Retour au CHCB demain.
--- a/tests/synthetic_regression/cases/005_force_mask_default_term/test.txt
+++ b/tests/synthetic_regression/cases/005_force_mask_default_term/test.txt
@@ -0,0 +1 @@
 Patient adressé au CHCB pour avis. Retour au CHCB demain.
--- a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json
+++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.audit.json
@@ -0,0 +1 @@
 []
--- a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt
+++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/expected.txt
@@ -0,0 +1 @@
 La classification internationale reste visible. La prise en charge est correcte.
--- a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt
+++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/input.txt
@@ -0,0 +1 @@
 La classification internationale reste visible. La prise en charge est correcte.
--- a/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt
+++ b/tests/synthetic_regression/cases/006_whitelist_phrases_preserved/test.txt
@@ -0,0 +1 @@
 La classification internationale reste visible. La prise en charge est correcte.
--- a/tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml
+++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml
@@ -0,0 +1,3 @@
 blacklist:
  force_mask_terms:
    - LOCAL_SIGLE
--- a/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json
+++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.audit.json
@@ -0,0 +1,7 @@
 [
  {
    "kind": "force_term",
    "original": "LOCAL_SIGLE",
    "replacement": "[MASK]"
  }
 ]
--- a/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt
+++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/expected.txt
@@ -0,0 +1 @@
 Réorientation vers [MASK] en urgence.
--- a/tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt
+++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/input.txt
@@ -0,0 +1 @@
 Réorientation vers LOCAL_SIGLE en urgence.
--- a/tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt
+++ b/tests/synthetic_regression/cases/007_overlay_force_mask_local/test.txt
@@ -0,0 +1 @@
 Réorientation vers LOCAL_SIGLE en urgence.
--- a/tests/synthetic_regression/cases/008_ville_header/expected.audit.json
+++ b/tests/synthetic_regression/cases/008_ville_header/expected.audit.json
@@ -0,0 +1,7 @@
 [
  {
    "kind": "VILLE",
    "original": "Bayonne",
    "replacement": "[VILLE]"
  }
 ]
--- a/tests/synthetic_regression/cases/008_ville_header/expected.txt
+++ b/tests/synthetic_regression/cases/008_ville_header/expected.txt
@@ -0,0 +1,2 @@
 [VILLE], le 12/03/2024
 Compte rendu adressé au patient.
--- a/tests/synthetic_regression/cases/008_ville_header/input.txt
+++ b/tests/synthetic_regression/cases/008_ville_header/input.txt
@@ -0,0 +1,2 @@
 Bayonne, le 12/03/2024
 Compte rendu adressé au patient.
--- a/tests/synthetic_regression/cases/008_ville_header/test.txt
+++ b/tests/synthetic_regression/cases/008_ville_header/test.txt
@@ -0,0 +1,2 @@
 Bayonne, le 12/03/2024
 Compte rendu adressé au patient.
--- a/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json
+++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.audit.json
@@ -0,0 +1,17 @@
 [
  {
    "kind": "NOM_GLOBAL",
    "original": "ETCHEVERRY",
    "replacement": "[NOM]"
  },
  {
    "kind": "NOM_GLOBAL",
    "original": "CLAUDE",
    "replacement": "[NOM]"
  },
  {
    "kind": "NOM_GLOBAL",
    "original": "JEAN",
    "replacement": "[NOM]"
  }
 ]
--- a/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt
+++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/expected.txt
@@ -0,0 +1,2 @@
 [NOM] [NOM] [NOM]
 Le patient [NOM] revient ce jour.
--- a/tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt
+++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/input.txt
@@ -0,0 +1,2 @@
 ETCHEVERRY JEAN CLAUDE
 Le patient ETCHEVERRY revient ce jour.
--- a/tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt
+++ b/tests/synthetic_regression/cases/009_header_and_repeated_name/test.txt
@@ -0,0 +1,2 @@
 ETCHEVERRY JEAN CLAUDE
 Le patient ETCHEVERRY revient ce jour.
--- a/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json
+++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.audit.json
@@ -0,0 +1,7 @@
 [
  {
    "kind": "ETAB_SPACED",
    "original": "C E N T R E   H O S P I T A L I E R   D E   L A   C O T E   B A S Q U E",
    "replacement": "[ETABLISSEMENT]"
  }
 ]
--- a/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt
+++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/expected.txt
@@ -0,0 +1,2 @@
 [ETABLISSEMENT]
 Service de cardiologie
--- a/tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt
+++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/input.txt
@@ -0,0 +1,2 @@
 C E N T R E   H O S P I T A L I E R   D E   L A   C O T E   B A S Q U E
 Service de cardiologie
--- a/tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt
+++ b/tests/synthetic_regression/cases/010_spaced_establishment_header/test.txt
@@ -0,0 +1,2 @@
 C E N T R E   H O S P I T A L I E R   D E   L A   C O T E   B A S Q U E
 Service de cardiologie
--- a/tests/synthetic_regression/manifest.json
+++ b/tests/synthetic_regression/manifest.json
@@ -0,0 +1,110 @@
 {
  "001_patient_header_and_birth": {
    "description": "En-tête patient en majuscules avec date de naissance masquée et date de soin conservée.",
    "must_contain": [
      "[DATE_NAISSANCE]",
      "Consultation du 14/03/2024"
    ],
    "must_not_contain": [
      "ETCHEVERRY",
      "JEAN",
      "CLAUDE",
      "12/03/1980"
    ]
  },
  "002_contact_bundle": {
    "description": "Email et téléphone dans une même ligne de contact.",
    "must_contain": [
      "[EMAIL]",
      "[TEL]"
    ],
    "must_not_contain": [
      "jean.dupont@example.com",
      "01 23 45 67 89"
    ]
  },
  "003_multiline_venue_number": {
    "description": "Numéro de venue éclaté sur deux lignes.",
    "must_contain": [
      "N° venue :",
      "[NDA]",
      "Date de séjour : 14/03/2024"
    ],
    "must_not_contain": [
      "1234567"
    ]
  },
  "004_identifier_bundle": {
    "description": "Bloc d'identifiants structurés variés.",
    "must_contain": [
      "[RPPS]",
      "[FINESS]",
      "[IPP]",
      "[OGC]",
      "[IBAN]"
    ],
    "must_not_contain": [
      "12345678901",
      "123456789",
      "ABC12345",
      "FR76 3000 6000 0112 3456 7890 189"
    ]
  },
  "005_force_mask_default_term": {
    "description": "Terme forcé par la configuration par défaut.",
    "must_contain": [
      "[MASK]"
    ],
    "must_not_contain": [
      "CHCB"
    ]
  },
  "006_whitelist_phrases_preserved": {
    "description": "Expressions métier explicitement préservées.",
    "must_contain": [
      "classification internationale",
      "prise en charge"
    ],
    "must_not_contain": []
  },
  "007_overlay_force_mask_local": {
    "description": "Terme local masqué via surcharge runtime.",
    "must_contain": [
      "[MASK]"
    ],
    "must_not_contain": [
      "LOCAL_SIGLE"
    ]
  },
  "008_ville_header": {
    "description": "Ville en en-tête de courrier, date conservée.",
    "must_contain": [
      "[VILLE], le 12/03/2024"
    ],
    "must_not_contain": [
      "Bayonne"
    ]
  },
  "009_header_and_repeated_name": {
    "description": "Propagation globale d'un nom vu dans l'en-tête.",
    "must_contain": [
      "Le patient [NOM] revient ce jour."
    ],
    "must_not_contain": [
      "ETCHEVERRY",
      "JEAN",
      "CLAUDE"
    ]
  },
  "010_spaced_establishment_header": {
    "description": "En-tête d'établissement avec lettres espacées.",
    "must_contain": [
      "[ETABLISSEMENT]",
      "Service de cardiologie"
    ],
    "must_not_contain": [
      "C E N T R E",
      "H O S P I T A L I E R"
    ]
  }
 }
--- a/tests/synthetic_regression/tests.md
+++ b/tests/synthetic_regression/tests.md
@@ -0,0 +1,25 @@
 # Jeux de tests synthétiques
 Ces fichiers sont les cas de test relisibles à la main. Chaque dossier contient :
 - `test.txt` : document synthétique d'entrée
 - `expected.txt` : sortie anonymisée attendue
 - `expected.audit.json` : résumé d'audit attendu
 Cas disponibles :
 - `001_patient_header_and_birth`
 - `002_contact_bundle`
 - `003_multiline_venue_number`
 - `004_identifier_bundle`
 - `005_force_mask_default_term`
 - `006_whitelist_phrases_preserved`
 - `007_overlay_force_mask_local`
 - `008_ville_header`
 - `009_header_and_repeated_name`
 - `010_spaced_establishment_header`
 Exemples de fichiers à ouvrir :
 - [001 test](cases/001_patient_header_and_birth/test.txt)
 - [001 attendu](cases/001_patient_header_and_birth/expected.txt)
 - [004 test](cases/004_identifier_bundle/test.txt)
 - [004 attendu](cases/004_identifier_bundle/expected.txt)
 - [007 surcharge locale](cases/007_overlay_force_mask_local/config_overlay.yml)
--- a/tests/synthetic_review/README.md
+++ b/tests/synthetic_review/README.md
@@ -0,0 +1,26 @@
 # Corpus synthétique de revue humaine
 Ce corpus ne remplace pas les tests unitaires. Il sert à valider des documents
 complets, relus par un humain, avec un vrai diff entre :
 - `test.txt` : document synthétique source
 - `expected.txt` : anonymisation attendue selon la règle métier
 - `actual/` : sortie réellement produite par le moteur
 Objectif :
 - détecter les régressions de composition sur des documents réalistes ;
 - rendre visibles les écarts de comportement du moteur ;
 - préparer une validation humaine avant promotion éventuelle en suite bloquante.
 Commande :
 ```bash
 python3 tools/run_synthetic_review_corpus.py
 ```
 Chaque exécution écrit :
 - `actual.txt`
 - `actual.audit.json`
 - `actual.summary.json`
 - `diff.txt`
 Sous [actual](actual/), dans un sous-dossier par cas.
--- a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json
+++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expectations.json
@@ -0,0 +1,31 @@
 {
  "required_kinds": [
    "ADRESSE",
    "CODE_POSTAL",
    "DATE_NAISSANCE",
    "EMAIL",
    "ETAB",
    "IPP",
    "NDA",
    "NOM_FORCE",
    "TEL",
    "VILLE",
    "force_term"
  ],
  "must_contain": [
    "classification internationale",
    "prise en charge",
    "Service de cardiologie"
  ],
  "must_not_contain": [
    "ETCHEVERRY",
    "JEAN",
    "CLAUDE",
    "12/03/1980",
    "06 12 34 56 78",
    "jean.claude.etcheverry@example.com",
    "ABC12345",
    "1234567",
    "CHCB"
  ]
 }
--- a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt
+++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/expected.txt
@@ -0,0 +1,19 @@
 [ETABLISSEMENT]
 [VILLE], le 14/03/2024
 COMPTE RENDU D'HOSPITALISATION
 Patient : [NOM] [NOM] [NOM]
 [DATE_NAISSANCE]
 Adresse : [ADRESSE]
 Code postal : [CODE_POSTAL]
 Ville de résidence : [VILLE]
 Téléphone : [TEL]
 Mail : [EMAIL]
 IPP : [IPP]
 N° venue :
 [NDA]
 Le patient [NOM] [NOM] [NOM] est adressé au [MASK] pour bilan.
 La classification internationale et la prise en charge sont discutées.
 Service de cardiologie.
--- a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md
+++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/review.md
@@ -0,0 +1,10 @@
 # Revue 001
 Points critiques :
 - le patient doit être masqué partout, y compris en reprise narrative ;
 - la date de naissance doit être masquée, pas la date de soin ;
 - l'adresse, le code postal, la ville, le téléphone, le mail, l'IPP et le numéro de venue doivent disparaître ;
 - `classification internationale`, `prise en charge` et `Service de cardiologie` doivent rester lisibles.
 Écart attendu aujourd'hui :
 - ce cas doit mettre en évidence si le moteur perd des labels structurés comme `Code postal :` ou `N° venue :`.
--- a/tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt
+++ b/tests/synthetic_review/cases/001_crh_hospitalisation_complete/test.txt
@@ -0,0 +1,19 @@
 CENTRE HOSPITALIER DE LA COTE BASQUE
 Bayonne, le 14/03/2024
 COMPTE RENDU D'HOSPITALISATION
 Patient : ETCHEVERRY JEAN CLAUDE
 Né le 12/03/1980
 Adresse : 14 rue des Lilas
 Code postal : 64100
 Ville de résidence : Bayonne
 Téléphone : 06 12 34 56 78
 Mail : jean.claude.etcheverry@example.com
 IPP : ABC12345
 N° venue :
 1234567
 Le patient ETCHEVERRY JEAN CLAUDE est adressé au CHCB pour bilan.
 La classification internationale et la prise en charge sont discutées.
 Service de cardiologie.
--- a/tests/synthetic_review/cases/002_imagerie_complete/expectations.json
+++ b/tests/synthetic_review/cases/002_imagerie_complete/expectations.json
@@ -0,0 +1,26 @@
 {
  "required_kinds": [
    "DATE_NAISSANCE",
    "DOSSIER",
    "ETAB_SPACED",
    "FINESS",
    "IBAN",
    "NOM_FORCE",
    "OGC",
    "RPPS"
  ],
  "must_contain": [
    "Service de radiologie",
    "classification internationale"
  ],
  "must_not_contain": [
    "DUPONT",
    "MARIE",
    "PAULE",
    "01/02/1975",
    "23L35781",
    "12345678901",
    "123456789",
    "FR76 3000 6000 0112 3456 7890 189"
  ]
 }
--- a/tests/synthetic_review/cases/002_imagerie_complete/expected.txt
+++ b/tests/synthetic_review/cases/002_imagerie_complete/expected.txt
@@ -0,0 +1,13 @@
 [ETABLISSEMENT]
 Service de radiologie
 Compte rendu d'imagerie
 Patient : [NOM] [NOM] [NOM]
 [DATE_NAISSANCE]
 N° examen : [DOSSIER]
 RPPS : [RPPS]
 FINESS : [FINESS]
 N° OGC : [OGC]
 IBAN : [IBAN]
 Le dossier de [NOM] [NOM] [NOM] est revu ce jour.
 La classification internationale est conservée.
--- a/tests/synthetic_review/cases/002_imagerie_complete/review.md
+++ b/tests/synthetic_review/cases/002_imagerie_complete/review.md
@@ -0,0 +1,7 @@
 # Revue 002
 Points critiques :
 - l'en-tête d'établissement espacé doit être réduit à un placeholder ;
 - le numéro d'examen, le RPPS, le FINESS, l'OGC et l'IBAN doivent disparaître ;
 - le nom du patient doit être masqué dans le champ structuré et dans la phrase narrative ;
 - `Service de radiologie` et `classification internationale` doivent rester visibles.
--- a/tests/synthetic_review/cases/002_imagerie_complete/test.txt
+++ b/tests/synthetic_review/cases/002_imagerie_complete/test.txt
@@ -0,0 +1,13 @@
 C E N T R E   H O S P I T A L I E R   D E   L A   C O T E   B A S Q U E
 Service de radiologie
 Compte rendu d'imagerie
 Patient : DUPONT MARIE PAULE
 Née le 01/02/1975
 N° examen : 23L35781
 RPPS : 12345678901
 FINESS : 123456789
 N° OGC : 12
 IBAN : FR76 3000 6000 0112 3456 7890 189
 Le dossier de DUPONT MARIE PAULE est revu ce jour.
 La classification internationale est conservée.
--- a/tests/synthetic_review/cases/003_consultation_complete/expectations.json
+++ b/tests/synthetic_review/cases/003_consultation_complete/expectations.json
@@ -0,0 +1,29 @@
 {
  "required_kinds": [
    "DATE_NAISSANCE",
    "EMAIL",
    "ETAB",
    "IPP",
    "NOM_FORCE",
    "RPPS",
    "TEL",
    "VILLE",
    "force_term"
  ],
  "must_contain": [
    "prise en charge en hôpital de jour"
  ],
  "must_not_contain": [
    "LAFITTE",
    "ANNE",
    "MARIE",
    "18/07/1968",
    "Bordeaux",
    "Anglet",
    "anne.lafitte@example.com",
    "01 23 45 67 89",
    "10987654321",
    "ZXC98765",
    "CHCB"
  ]
 }
--- a/tests/synthetic_review/cases/003_consultation_complete/expected.txt
+++ b/tests/synthetic_review/cases/003_consultation_complete/expected.txt
@@ -0,0 +1,14 @@
 [ETABLISSEMENT]
 [VILLE], le 22/05/2024
 CONSULTATION DE SUIVI
 Patient : [NOM] [NOM] [NOM]
 [DATE_NAISSANCE]
 Lieu de naissance : [VILLE]
 Ville de résidence : [VILLE]
 Contact : [EMAIL] ou [TEL]
 RPPS : [RPPS]
 IPP : [IPP]
 Le patient [NOM] [NOM] [NOM] est adressé au [MASK].
 La prise en charge en hôpital de jour est maintenue.
--- a/tests/synthetic_review/cases/003_consultation_complete/review.md
+++ b/tests/synthetic_review/cases/003_consultation_complete/review.md
@@ -0,0 +1,7 @@
 # Revue 003
 Points critiques :
 - la ville d'en-tête, le lieu de naissance et la ville de résidence doivent être masqués ;
 - le contact mail/téléphone, le RPPS et l'IPP doivent être masqués ;
 - la reprise narrative du nom du patient doit être masquée ;
 - `prise en charge en hôpital de jour` doit rester visible.
--- a/tests/synthetic_review/cases/003_consultation_complete/test.txt
+++ b/tests/synthetic_review/cases/003_consultation_complete/test.txt
@@ -0,0 +1,14 @@
 CLINIQUE ATLANTIQUE
 Biarritz, le 22/05/2024
 CONSULTATION DE SUIVI
 Patient : LAFITTE ANNE MARIE
 Née le 18/07/1968
 Lieu de naissance : Bordeaux
 Ville de résidence : Anglet
 Contact : anne.lafitte@example.com ou 01 23 45 67 89
 RPPS : 10987654321
 IPP : ZXC98765
 Le patient LAFITTE ANNE MARIE est adressé au CHCB.
 La prise en charge en hôpital de jour est maintenue.
--- a/tests/synthetic_review/cases/004_structured_admin_complete/expectations.json
+++ b/tests/synthetic_review/cases/004_structured_admin_complete/expectations.json
@@ -0,0 +1,27 @@
 {
  "required_kinds": [
    "EMAIL",
    "FINESS",
    "IPP",
    "NOM_GLOBAL",
    "OGC",
    "RPPS",
    "TEL",
    "VILLE",
    "force_term"
  ],
  "must_not_contain": [
    "ETCHEVERRY",
    "JEAN",
    "CLAUDE",
    "ABC12345",
    "123456789",
    "12345678901",
    "Bayonne",
    "Bordeaux",
    "Anglet",
    "06 11 22 33 44",
    "jean.dupont@example.com",
    "CHCB"
  ]
 }
--- a/tests/synthetic_review/cases/004_structured_admin_complete/expected.txt
+++ b/tests/synthetic_review/cases/004_structured_admin_complete/expected.txt
@@ -0,0 +1,11 @@
 [NOM] [NOM] [NOM]
 IPP : [IPP]
 FINESS : [FINESS]
 RPPS : [RPPS]
 [VILLE], le 12/03/2024
 Lieu de naissance : [VILLE]
 Ville de résidence : [VILLE]
 Téléphone : [TEL]
 Mail : [EMAIL]
 N° OGC : [OGC]
 Patient adressé au [MASK] pour avis. Retour au [MASK] demain.
--- a/tests/synthetic_review/cases/004_structured_admin_complete/review.md
+++ b/tests/synthetic_review/cases/004_structured_admin_complete/review.md
@@ -0,0 +1,7 @@
 # Revue 004
 Points critiques :
 - les identifiants structurés doivent être masqués même quand le label et la valeur sont séparés ;
 - la ville d'en-tête et les villes structurées doivent disparaître ;
 - le nom de patient en en-tête doit être propagé ;
 - les deux occurrences de `CHCB` doivent être masquées.
--- a/tests/synthetic_review/cases/004_structured_admin_complete/test.txt
+++ b/tests/synthetic_review/cases/004_structured_admin_complete/test.txt
@@ -0,0 +1,12 @@
 ETCHEVERRY JEAN CLAUDE
 IPP
 ABC12345
 FINESS : 123456789
 RPPS : 12345678901
 Bayonne, le 12/03/2024
 Lieu de naissance : Bordeaux
 Ville de résidence : Anglet
 Téléphone : 06 11 22 33 44
 Mail : jean.dupont@example.com
 N° OGC : 12
 Patient adressé au CHCB pour avis. Retour au CHCB demain.
--- a/tests/synthetic_review/tests.md
+++ b/tests/synthetic_review/tests.md
@@ -0,0 +1,15 @@
 # Index du corpus de revue
 Cas complets disponibles :
 - [001 source](cases/001_crh_hospitalisation_complete/test.txt)
 - [001 attendu](cases/001_crh_hospitalisation_complete/expected.txt)
 - [001 revue](cases/001_crh_hospitalisation_complete/review.md)
 - [002 source](cases/002_imagerie_complete/test.txt)
 - [002 attendu](cases/002_imagerie_complete/expected.txt)
 - [002 revue](cases/002_imagerie_complete/review.md)
 - [003 source](cases/003_consultation_complete/test.txt)
 - [003 attendu](cases/003_consultation_complete/expected.txt)
 - [003 revue](cases/003_consultation_complete/review.md)
 - [004 source](cases/004_structured_admin_complete/test.txt)
 - [004 attendu](cases/004_structured_admin_complete/expected.txt)
 - [004 revue](cases/004_structured_admin_complete/review.md)
--- a/tests/unit/test_config_externalization.py
+++ b/tests/unit/test_config_externalization.py
@@ -0,0 +1,92 @@
 #!/usr/bin/env python3
 """
 Tests de non-régression pour la config externalisée.
 """
 from pathlib import Path
 import anonymizer_core_refactored_onnx as core
 from config_defaults import (
    deep_merge_dict,
    ensure_runtime_dictionaries_config,
    load_effective_dictionaries_dict,
    read_default_dictionaries_text,
    read_runtime_dictionaries_overlay_text,
 )
 def test_default_config_template_is_externalized():
    """Check the default dictionaries text and the config loaded with no path.

    The default text must mention both top-level sections, and the config
    returned by ``core.load_dictionaries(None)`` must list ``CHCB`` among
    the force-mask terms.
    """
    template = read_default_dictionaries_text()
    for section in ("blacklist:", "whitelist_phrases:"):
        assert section in template
    defaults = core.load_dictionaries(None)
    force_terms = defaults["blacklist"]["force_mask_terms"]
    assert "CHCB" in force_terms
 def test_runtime_overlay_template_is_minimal():
    """The overlay template text must name the default file and contain ``{}``."""
    overlay_text = read_runtime_dictionaries_overlay_text()
    expected_fragments = ("dictionnaires.default.yml", "{}")
    for fragment in expected_fragments:
        assert fragment in overlay_text
 def test_deep_merge_dict_preserves_nested_defaults():
    """Pin the merge semantics of ``deep_merge_dict`` on nested mappings.

    Expected by the assertions below: lists are concatenated (base first),
    scalars from the override win, and keys absent from the override keep
    their base value.
    """
    defaults = {
        "whitelist": {"sections_titres": ["DIM"], "org_gpe_keep": False},
        "flags": {"case_insensitive": True, "regex_engine": "python"},
    }
    overlay = {
        "whitelist": {"sections_titres": ["GHM"], "org_gpe_keep": True},
        "flags": {"regex_engine": "re2"},
    }
    result = deep_merge_dict(defaults, overlay)
    whitelist = result["whitelist"]
    assert whitelist["sections_titres"] == ["DIM", "GHM"]
    assert whitelist["org_gpe_keep"] is True
    flags = result["flags"]
    assert flags["case_insensitive"] is True
    assert flags["regex_engine"] == "re2"
 def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
    """Extra stop words from a config file are applied, then dropped on reload with no path."""
    cfg_path = tmp_path / "cfg.yml"
    cfg_path.write_text("additional_stopwords:\n  - xyzzymed\n", encoding="utf-8")
    # Loading a config declaring an extra stop word must expose it in both
    # module-level containers of the core module.
    core.load_dictionaries(cfg_path)
    assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET
    assert "xyzzymed" in core._MEDICAL_STOP_WORDS
    # Reloading with None must remove the extra stop word again.
    core.load_dictionaries(None)
    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET
    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS
 def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
    """The overlay file is created on demand and merged on top of the defaults."""
    cfg_path = tmp_path / "dictionnaires.yml"
    # The helper must create the file at the requested path and return that path.
    created = ensure_runtime_dictionaries_config(cfg_path)
    assert created == cfg_path
    assert cfg_path.exists()
    # With the freshly created overlay, the default term must still be present.
    effective = load_effective_dictionaries_dict(cfg_path)
    assert "CHCB" in effective["blacklist"]["force_mask_terms"]
    # Overwrite the overlay with a single local term ...
    cfg_path.write_text(
        "blacklist:\n  force_mask_terms:\n    - LOCAL_SIGLE\n",
        encoding="utf-8",
    )
    # ... the effective config must then contain both the default and the local term.
    effective = load_effective_dictionaries_dict(cfg_path)
    assert "CHCB" in effective["blacklist"]["force_mask_terms"]
    assert "LOCAL_SIGLE" in effective["blacklist"]["force_mask_terms"]
--- a/tests/unit/test_header_pii_detection.py
+++ b/tests/unit/test_header_pii_detection.py
@@ -0,0 +1,63 @@
 #!/usr/bin/env python3
 """
 Tests de non-régression pour les fuites en en-tête de document.
 """
 from anonymizer_core_refactored_onnx import (
    RE_NUM_ACCESSION_HEADER,
    RE_NUM_EXAMEN_PATIENT,
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
 )
 class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""
    def test_uppercase_patient_header_is_masked(self):
        """A bare uppercase three-token name line must be fully replaced by [NOM] placeholders."""
        cfg = load_dictionaries(None)
        anon = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], cfg)
        assert "ETCHEVERRY" not in anon.text_out
        assert "JEAN" not in anon.text_out
        assert "CLAUDE" not in anon.text_out
        assert anon.text_out == "[NOM] [NOM] [NOM]"
    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam-number regex must capture the compact identifier in group 1."""
        match = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")
        assert match is not None
        assert match.group(1) == "23L35781"
    def test_bare_header_accession_number_is_added_to_audit(self):
        """An unlabeled "N° <id>" header line must match and be audited as DOSSIER."""
        cfg = load_dictionaries(None)
        text = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )
        match = RE_NUM_ACCESSION_HEADER.search(text)
        assert match is not None
        assert match.group(1) == "23L35781"
        anon = anonymise_document_regex([text], [[]], cfg)
        assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit)
    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """After the rescan pass the label is kept and the value becomes [DOSSIER]."""
        cfg = load_dictionaries(None)
        anon = anonymise_document_regex(["N° examen : 23L35781"], [[]], cfg)
        text = selective_rescan(anon.text_out, cfg)
        assert text == "N° examen : [DOSSIER]"
        assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit)
    def test_structured_code_postal_preserves_label_and_audit(self):
        """The "Code postal :" label must survive while the value is masked and audited."""
        cfg = load_dictionaries(None)
        anon = anonymise_document_regex(["Code postal : 64100"], [[]], cfg)
        text = selective_rescan(anon.text_out, cfg)
        assert text == "Code postal : [CODE_POSTAL]"
        assert any(h.kind == "CODE_POSTAL" and h.original == "64100" for h in anon.audit)
--- a/tests/unit/test_synthetic_regression.py
+++ b/tests/unit/test_synthetic_regression.py
@@ -0,0 +1,100 @@
 #!/usr/bin/env python3
 """
 Tests synthétiques de non-régression pour l'anonymisation.
 """
 import json
 from pathlib import Path
 import pytest
 from anonymizer_core_refactored_onnx import (
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
 )
 from evaluation.leak_scanner import LeakScanner
 SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
 CASES_DIR = SUITE_DIR / "cases"
 MANIFEST_PATH = SUITE_DIR / "manifest.json"
 LEAK_SCANNER = LeakScanner()
 def _normalize_text(text: str) -> str:
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in text.strip().splitlines())
 def _load_manifest() -> dict:
    """Read and parse the UTF-8 JSON manifest located at ``MANIFEST_PATH``."""
    raw_manifest = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_manifest)
 def _case_dirs() -> list[Path]:
    """Return the sub-directories of ``CASES_DIR`` in sorted order."""
    directories = [entry for entry in CASES_DIR.iterdir() if entry.is_dir()]
    directories.sort()
    return directories
 def _normalize_audit(audit: list) -> list[dict]:
    return [
        {
            "kind": hit.kind,
            "original": hit.original,
            "replacement": hit.placeholder,
        }
        for hit in audit
    ]
 def _load_case_cfg(case_dir: Path):
    """Load dictionaries for a case, passing its ``config_overlay.yml`` when that file exists."""
    candidate = case_dir / "config_overlay.yml"
    if candidate.exists():
        return load_dictionaries(candidate)
    return load_dictionaries(None)
 def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry for ``case_name``; raises ``KeyError`` when absent."""
    return _load_manifest()[case_name]
 def test_synthetic_regression_inventory():
    """The manifest must exist and list as many entries as there are case folders (10)."""
    expected_case_count = 10
    assert MANIFEST_PATH.exists()
    assert len(_case_dirs()) == expected_case_count
    assert len(_load_manifest()) == expected_case_count
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
 def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case and compare text, audit, manifest rules and leak scan."""
    cfg = _load_case_cfg(case_dir)
    case_rules = _assertions_for(case_dir.name)
    # The input file is "test.txt"; "input.txt" is accepted as a fallback name.
    input_path = case_dir / "test.txt"
    if not input_path.exists():
        input_path = case_dir / "input.txt"
    input_text = input_path.read_text(encoding="utf-8")
    expected_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expected_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8"))
    # Single-page document with an empty per-page list as second argument,
    # followed by the rescan pass on the produced text.
    result = anonymise_document_regex([input_text], [[]], cfg)
    actual_text = _normalize_text(selective_rescan(result.text_out, cfg))
    actual_audit = _normalize_audit(result.audit)
    # Exact comparison against the reviewed expected output and audit.
    assert actual_text == expected_text
    assert actual_audit == expected_audit
    # Substring rules taken from the manifest entry of this case.
    for required in case_rules.get("must_contain", []):
        assert required in actual_text
    for forbidden in case_rules.get("must_not_contain", []):
        assert forbidden not in actual_text
    # Final check: the scanner is seeded with the audited originals and must report nothing.
    leaks = LEAK_SCANNER.scan_text(
        actual_text,
        [
            {
                "kind": item["kind"],
                "original": item["original"],
            }
            for item in actual_audit
        ],
    )
    assert not leaks
--- a/tools/debug_force_term.py
+++ b/tools/debug_force_term.py
@@ -2,12 +2,12 @@
 """Debug force_term mechanism."""
 import re
 import yaml
 from pathlib import Path
-# Load config
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
-cfg_path = Path("config/dictionnaires.yml")
+
-cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
+# Load effective config
 cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
 cfg = load_effective_dictionaries_dict(cfg_path)
 print("=" * 80)
 print("CONFIG LOADED")
--- a/tools/quick_test_date_correction.py
+++ b/tools/quick_test_date_correction.py
@@ -5,6 +5,7 @@ import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 # Test sur 3 documents du test dataset
@@ -32,7 +33,7 @@ for doc in test_docs:
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
@@ -56,4 +57,3 @@ for doc in test_docs:
        print(f"❌ {pdf_path.name}: Erreur - {e}")
 print("\n✅ Test terminé")
--- a/tools/run_synthetic_review_corpus.py
+++ b/tools/run_synthetic_review_corpus.py
@@ -0,0 +1,169 @@
 #!/usr/bin/env python3
 """
 Exécute le corpus synthétique de revue humaine et produit les diffs.
 """
 from __future__ import annotations
 import argparse
 import difflib
 import json
 import shutil
 import sys
 from collections import Counter
 from pathlib import Path
 ROOT = Path(__file__).resolve().parents[1]
 if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
 from anonymizer_core_refactored_onnx import (  # noqa: E402
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
 )
 from evaluation.leak_scanner import LeakScanner  # noqa: E402
 CORPUS_DIR = ROOT / "tests" / "synthetic_review"
 CASES_DIR = CORPUS_DIR / "cases"
 ACTUAL_DIR = CORPUS_DIR / "actual"
 SCANNER = LeakScanner()
 def normalize_text(text: str) -> str:
    """Normalize line endings and trailing spaces; the result always ends with one newline."""
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    trimmed_lines = [line.rstrip() for line in unified.strip().splitlines()]
    return "\n".join(trimmed_lines) + "\n"
 def load_expectations(case_dir: Path) -> dict:
    """Return the parsed ``expectations.json`` of a case, or ``{}`` when the file is absent."""
    candidate = case_dir / "expectations.json"
    return json.loads(candidate.read_text(encoding="utf-8")) if candidate.exists() else {}
 def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
    """Build the seed list handed to the leak scanner from audit entries.

    Entries whose original value is too short or ambiguous are dropped to
    avoid false positives:

    - missing or ``None`` originals (these used to be turned into the
      literal string "None" and seeded);
    - values with fewer than 4 non-whitespace characters;
    - purely numeric values with fewer than 6 digits.

    Args:
        audit: Audit entries as dicts holding a ``kind`` key and, normally,
            an ``original`` key.

    Returns:
        A list of ``{"kind", "original"}`` dicts where ``original`` is the
        stripped string form of the audited value.

    Raises:
        KeyError: If a retained entry has no ``kind`` key.
    """
    seed = []
    for item in audit:
        raw = item.get("original")
        if raw is None:
            continue
        original = str(raw).strip()
        # Measure the effective length without any whitespace (tabs and
        # non-breaking spaces included), not only plain spaces.
        compact = "".join(original.split())
        if len(compact) < 4:
            continue
        if compact.isdigit() and len(compact) < 6:
            continue
        seed.append({"kind": item["kind"], "original": original})
    return seed
 def run_case(case_dir: Path) -> dict:
    """Run one review case, write its artefacts and return the list of failures.

    Args:
        case_dir: Case folder containing ``test.txt`` and ``expected.txt``,
            plus optional ``config_overlay.yml`` and ``expectations.json``.

    Returns:
        A dict with the case name (``case``), the list of failure labels
        (``failures``, empty when everything matches) and the output folder
        as a string (``output_dir``).
    """
    # An overlay file in the case folder is passed to the loader; otherwise None.
    cfg_path = case_dir / "config_overlay.yml"
    cfg = load_dictionaries(cfg_path if cfg_path.exists() else None)
    source_text = (case_dir / "test.txt").read_text(encoding="utf-8")
    expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expectations = load_expectations(case_dir)
    # Single-page document, then the rescan pass on the produced text.
    anon = anonymise_document_regex([source_text], [[]], cfg)
    actual_text = normalize_text(selective_rescan(anon.text_out, cfg))
    # Flatten audit hits into JSON-serializable dicts.
    audit = [
        {
            "kind": hit.kind,
            "original": hit.original,
            "replacement": hit.placeholder,
        }
        for hit in anon.audit
    ]
    summary = {
        "kinds_present": sorted(set(item["kind"] for item in audit)),
        "kind_counts": dict(sorted(Counter(item["kind"] for item in audit).items())),
        "audit_len": len(audit),
        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
    }
    # Start from an empty output folder so stale artefacts never survive a run.
    case_actual_dir = ACTUAL_DIR / case_dir.name
    if case_actual_dir.exists():
        shutil.rmtree(case_actual_dir)
    case_actual_dir.mkdir(parents=True, exist_ok=True)
    (case_actual_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
    (case_actual_dir / "actual.audit.json").write_text(
        json.dumps(audit, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    (case_actual_dir / "actual.summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    # Unified diff expected -> actual; the file is empty when both texts match.
    diff_lines = list(
        difflib.unified_diff(
            expected_text.splitlines(keepends=True),
            actual_text.splitlines(keepends=True),
            fromfile=f"{case_dir.name}/expected.txt",
            tofile=f"{case_dir.name}/actual.txt",
        )
    )
    (case_actual_dir / "diff.txt").write_text("".join(diff_lines), encoding="utf-8")
    # Collect every failure label instead of stopping at the first one.
    failures = []
    if actual_text != expected_text:
        failures.append("text_diff")
    if summary["leaks"]:
        failures.append("leak_detected")
    required_kinds = expectations.get("required_kinds", [])
    missing_kinds = sorted(kind for kind in required_kinds if kind not in summary["kinds_present"])
    if missing_kinds:
        failures.append(f"missing_kinds:{','.join(missing_kinds)}")
    for required in expectations.get("must_contain", []):
        if required not in actual_text:
            failures.append(f"missing_text:{required}")
    for forbidden in expectations.get("must_not_contain", []):
        if forbidden in actual_text:
            failures.append(f"forbidden_text:{forbidden}")
    return {
        "case": case_dir.name,
        "failures": failures,
        "output_dir": str(case_actual_dir),
    }
 def main() -> int:
    """Run every case folder, print one status line per case and return the exit code.

    The exit code is 1 only when ``--strict`` is given and at least one case
    reported failures; otherwise it is 0.
    """
    cli = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
    cli.add_argument(
        "--strict",
        action="store_true",
        help="Retourne un code non nul si un cas diffère de l'attendu.",
    )
    options = cli.parse_args()
    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
    # Run all cases first, then report, so output is only printed once every case has completed.
    reports = []
    for entry in sorted(CASES_DIR.iterdir()):
        if entry.is_dir():
            reports.append(run_case(entry))
    any_failure = False
    for report in reports:
        problems = report["failures"]
        if not problems:
            print(f"[OK]   {report['case']}")
        else:
            any_failure = True
            print(f"[FAIL] {report['case']}: {', '.join(problems)}")
        print(f"       -> {report['output_dir']}")
    return 1 if (options.strict and any_failure) else 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/tools/test_all_cro.py
+++ b/tools/test_all_cro.py
@@ -8,6 +8,7 @@ sys.path.insert(0, '.')
 from pathlib import Path
 import re
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 import time
@@ -47,7 +48,7 @@ def test_all_cro():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            # Lire le texte anonymisé
--- a/tools/test_chcb_leak.py
+++ b/tools/test_chcb_leak.py
@@ -8,6 +8,7 @@ import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 def test_chcb_detection():
    """Test CHCB detection on the 2 documents with leaks."""
@@ -53,7 +54,7 @@ def test_chcb_detection():
            out_dir=outdir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
        )
@@ -102,7 +103,7 @@ def test_chcb_detection():
            out_dir=outdir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
        )
--- a/tools/test_date_propagation.py
+++ b/tools/test_date_propagation.py
@@ -9,6 +9,7 @@ sys.path.insert(0, '.')
 from pathlib import Path
 import re
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 def test_date_propagation():
@@ -47,7 +48,7 @@ def test_date_propagation():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            # Lire le texte anonymisé
--- a/tools/test_gui_complete.py
+++ b/tools/test_gui_complete.py
@@ -9,6 +9,7 @@ import time
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 # Dossier de test
 test_dir = Path("/tmp/test_gui_pdfs")
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=True,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            ner_thresholds=None,
--- a/tools/test_gui_simulation.py
+++ b/tools/test_gui_simulation.py
@@ -8,6 +8,7 @@ import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import anonymizer_core_refactored_onnx as core
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 # Simuler exactement ce que fait le GUI
 test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
@@ -27,7 +28,7 @@ try:
        out_dir=out_dir,
        make_vector_redaction=False,
        also_make_raster_burn=True,
-        config_path=Path("config/dictionnaires.yml"),
+        config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
        use_hf=False,
        ner_manager=None,
        ner_thresholds=None,
--- a/tools/test_phase1_corrections.py
+++ b/tools/test_phase1_corrections.py
@@ -16,6 +16,7 @@ import re
 # Ajouter le répertoire racine au path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 def test_phase1_corrections():
@@ -52,7 +53,7 @@ def test_phase1_corrections():
            # Anonymiser le document
            result = process_pdf(
                pdf_path=pdf_path,
-                config_path=Path("config/dictionnaires.yml"),
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
                ner_manager=None,
                eds_pseudo_manager=None,
                vlm_manager=None,
--- a/tools/validate_corpus_sample.py
+++ b/tools/validate_corpus_sample.py
@@ -16,6 +16,7 @@ import re
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 def validate_corpus_sample():
@@ -94,7 +95,7 @@ def validate_corpus_sample():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,  # Pas de PDF pour aller plus vite
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            doc_time = time.time() - doc_start
--- a/tools/validate_full_corpus.py
+++ b/tools/validate_full_corpus.py
@@ -17,6 +17,7 @@ import re
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 def validate_full_corpus():
@@ -70,7 +71,7 @@ def validate_full_corpus():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=True,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            doc_time = time.time() - doc_start
--- a/tools/validate_phase1_on_production.py
+++ b/tools/validate_phase1_on_production.py
@@ -10,6 +10,7 @@ from pathlib import Path
 import json
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 # 5 documents du corpus production (OGC 008)
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
		`@@ -0,0 +1 @@`
							`Contact: jean.dupont@example.com ou 01 23 45 67 89`
		`@@ -0,0 +1 @@`
							`Patient adressé au [MASK] pour avis. Retour au [MASK] demain.`
		`@@ -0,0 +1 @@`
							`Patient adressé au CHCB pour avis. Retour au CHCB demain.`
		`@@ -0,0 +1 @@`
							`La classification internationale reste visible. La prise en charge est correcte.`
		`@@ -0,0 +1 @@`
							`Réorientation vers LOCAL_SIGLE en urgence.`
		`@@ -0,0 +1,2 @@`
							`[VILLE], le 12/03/2024`
							`Compte rendu adressé au patient.`
		`@@ -0,0 +1,2 @@`
							`Bayonne, le 12/03/2024`
							`Compte rendu adressé au patient.`
		`@@ -0,0 +1,2 @@`
							`[NOM] [NOM] [NOM]`
							`Le patient [NOM] revient ce jour.`
		`@@ -0,0 +1,2 @@`
							`ETCHEVERRY JEAN CLAUDE`
							`Le patient ETCHEVERRY revient ce jour.`
		`@@ -0,0 +1,2 @@`
							`C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E`
							`Service de cardiologie`