Externalize dictionaries and add anonymization review corpus

This commit is contained in:
2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions

View File

@@ -122,8 +122,9 @@ Fonction : `_mask_line_by_regex`
| Dates | `[DATE]` | 12/03/2024 | | Dates | `[DATE]` | 12/03/2024 |
| Adresses | `[ADRESSE]` | 12 rue de la Paix | | Adresses | `[ADRESSE]` | 12 rue de la Paix |
Configuration supplementaire via `config/dictionnaires.yml` : Configuration :
listes blanches, force-mask et regex personnalisees. - `config/dictionnaires.default.yml` : template versionne, source de verite des valeurs par defaut
- `config/dictionnaires.yml` : surcharge locale chargee par defaut, contenant uniquement les ecarts site/runtime
### 3. Reconnaissance d'entites nommees (NER) ### 3. Reconnaissance d'entites nommees (NER)
@@ -180,6 +181,7 @@ un fallback OCR est utilise :
| Element | Description | | Element | Description |
|-------------------------------|------------------------------------------------| |-------------------------------|------------------------------------------------|
| `config/dictionnaires.yml` | Listes blanches, force-mask, regex custom | | `config/dictionnaires.default.yml` | Valeurs par defaut completes et versionnees |
| `config/dictionnaires.yml` | Surcharge locale optionnelle (ecarts uniquement) |
| `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots) | | `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots) |
| Ligne de commande | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` | | Ligne de commande | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` |

View File

@@ -48,33 +48,16 @@ try:
except Exception: except Exception:
yaml = None yaml = None
APP_TITLE = "Pseudonymisation de PDF" from config_defaults import (
DEFAULT_CFG = Path("config/dictionnaires.yml") RUNTIME_DICTIONARIES_CONFIG_PATH,
read_default_dictionaries_text,
read_runtime_dictionaries_overlay_text,
)
DEFAULTS_CFG_TEXT = r""" APP_TITLE = "Pseudonymisation de PDF"
# dictionnaires.yml valeurs par défaut (bloc littéral pour les regex) DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
version: 1 DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
encoding: "utf-8" RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
class ToolTip: class ToolTip:
@@ -208,7 +191,7 @@ class App:
# YAML helpers # YAML helpers
def _ensure_cfg_exists(self): def _ensure_cfg_exists(self):
p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True) p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") if not p.exists(): p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
def _cfg_browse(self): def _cfg_browse(self):
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
if d: self.cfg_path.set(d) if d: self.cfg_path.set(d)
@@ -225,14 +208,14 @@ class App:
if yaml is None: if yaml is None:
messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
try: try:
Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8") Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or {}, allow_unicode=True, sort_keys=False), encoding="utf-8")
self._log("Règles sauvegardées.") self._log("Règles sauvegardées.")
except Exception as e: except Exception as e:
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}") messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.") def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
def _restore_defaults(self): def _restore_defaults(self):
try: try:
Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg() Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8"); self._log("Surcharge locale réinitialisée."); self._load_cfg()
except Exception as e: except Exception as e:
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

View File

@@ -20,7 +20,6 @@ import os
import platform import platform
import queue import queue
import re import re
import shutil
import subprocess import subprocess
import sys import sys
import threading import threading
@@ -75,6 +74,11 @@ try:
except Exception: except Exception:
yaml = None yaml = None
from config_defaults import (
read_default_dictionaries_text,
read_runtime_dictionaries_overlay_text,
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Thème optionnel # Thème optionnel
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -142,47 +146,19 @@ def _resolve_config() -> Path:
pour que l'utilisateur puisse la modifier sans recompiler. pour que l'utilisateur puisse la modifier sans recompiler.
""" """
exe_cfg = _exe_dir() / "config" / "dictionnaires.yml" exe_cfg = _exe_dir() / "config" / "dictionnaires.yml"
app_cfg = _app_dir() / "config" / "dictionnaires.yml"
if exe_cfg.exists(): if exe_cfg.exists():
return exe_cfg return exe_cfg
# Premier lancement : copier la config embarquée à côté de l'exe exe_cfg.parent.mkdir(parents=True, exist_ok=True)
if app_cfg.exists(): exe_cfg.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
exe_cfg.parent.mkdir(parents=True, exist_ok=True) return exe_cfg
import shutil
shutil.copy2(str(app_cfg), str(exe_cfg))
return exe_cfg
return app_cfg # fallback
DEFAULT_CFG = _resolve_config() DEFAULT_CFG = _resolve_config()
MODELS_DIR = _app_dir() / "models" MODELS_DIR = _app_dir() / "models"
DEFAULTS_CFG_TEXT = r""" DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
# dictionnaires.yml valeurs par défaut (bloc littéral pour les regex) RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
# Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir) # Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir)
# Magenta du logo : primaire (boutons, accents) # Magenta du logo : primaire (boutons, accents)
@@ -1593,7 +1569,7 @@ class App:
p = Path(self.cfg_path.get()) p = Path(self.cfg_path.get())
p.parent.mkdir(parents=True, exist_ok=True) p.parent.mkdir(parents=True, exist_ok=True)
if not p.exists(): if not p.exists():
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
def _load_cfg(self): def _load_cfg(self):
if yaml is None: if yaml is None:

View File

@@ -24,36 +24,11 @@ try:
import yaml # PyYAML for dictionaries import yaml # PyYAML for dictionaries
except Exception: except Exception:
yaml = None yaml = None
from config_defaults import (
# ----------------- Defaults & Config ----------------- RUNTIME_DICTIONARIES_CONFIG_PATH,
DEFAULTS_CFG = { load_effective_dictionaries_dict,
"version": 1, load_default_dictionaries_dict,
"encoding": "utf-8", )
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": True,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
PLACEHOLDERS = { PLACEHOLDERS = {
"EMAIL": "[EMAIL]", "EMAIL": "[EMAIL]",
@@ -103,16 +78,7 @@ class AnonResult:
# ----------------- Config loader ----------------- # ----------------- Config loader -----------------
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
cfg = DEFAULTS_CFG.copy() return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
if config_path and config_path.exists() and yaml is not None:
try:
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
# shallow-merge for top-level keys
for k, v in user.items():
cfg[k] = v
except Exception:
pass
return cfg
# ----------------- Extraction ----------------- # ----------------- Extraction -----------------
@@ -416,7 +382,7 @@ if __name__ == "__main__":
ap.add_argument("--out", type=str, default="out") ap.add_argument("--out", type=str, default="out")
ap.add_argument("--no-vector", action="store_true") ap.add_argument("--no-vector", action="store_true")
ap.add_argument("--raster", action="store_true") ap.add_argument("--raster", action="store_true")
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
args = ap.parse_args() args = ap.parse_args()
outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config)) outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
print(json.dumps(outs, indent=2, ensure_ascii=False)) print(json.dumps(outs, indent=2, ensure_ascii=False))

View File

@@ -44,6 +44,12 @@ try:
except Exception: except Exception:
yaml = None yaml = None
from config_defaults import (
RUNTIME_DICTIONARIES_CONFIG_PATH,
load_effective_dictionaries_dict,
load_default_dictionaries_dict,
)
try: try:
from doctr.models import ocr_predictor as _doctr_ocr_predictor from doctr.models import ocr_predictor as _doctr_ocr_predictor
_DOCTR_AVAILABLE = True _DOCTR_AVAILABLE = True
@@ -115,6 +121,29 @@ def _load_bdpm_medication_names() -> set:
return set() return set()
def _load_wordlist_file(
    path: Path,
    *,
    transform=lambda s: s,
    label: str,
    min_len: int = 1,
) -> set:
    """Load a plain-text word list (one entry per line) into a set.

    Each line is stripped of surrounding whitespace. Blank lines, lines
    starting with ``#`` and entries shorter than ``min_len`` characters are
    skipped; every remaining entry is passed through ``transform`` before
    being added to the result.

    Args:
        path: Text file to read.
        transform: Callable applied to each kept entry (identity by default).
        label: Human-readable name of the list, used in log messages.
        min_len: Minimum length of an entry, measured before ``transform``
            is applied.

    Returns:
        The set of transformed entries. The set is empty when the file does
        not exist. Failures are logged instead of raised, in which case the
        entries collected before the failure (none if the file could not be
        read at all) are returned.
    """
    result: set = set()
    if not path.exists():
        log.warning("%s introuvable : %s", label, path)
        return result
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # byte-order mark; with plain "utf-8" the BOM would stay glued to the
        # first entry and hide a "#" comment placed on the first line.
        for line in path.read_text(encoding="utf-8-sig").splitlines():
            word = line.strip()
            if word and not word.startswith("#") and len(word) >= min_len:
                result.add(transform(word))
        log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name)
    except Exception as exc:
        # Deliberate best effort: a broken word list must not abort the caller,
        # so the failure is only logged. Path and cause are kept apart so the
        # message stays readable.
        log.error("%s : erreur de lecture %s : %s", label, path, exc)
    return result
# ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) ----------------- # ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) -----------------
# Prénoms et noms de famille sont utilisés sous deux formes : # Prénoms et noms de famille sont utilisés sous deux formes :
# - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS" # - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS"
@@ -199,62 +228,24 @@ _FINESS_ADDR_AC = None # Automate Aho-Corasick pour adresses (noms d
_VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS) _VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS)
# Communes trop ambiguës (homonymes de mots courants, trop courts, etc.) # Communes trop ambiguës (homonymes de mots courants, trop courts, etc.)
_VILLE_BLACKLIST = { _VILLE_BLACKLIST_FALLBACK = {
# Directions / mots géographiques génériques "PARIS",
"SAINT", "NORD", "SUD", "EST", "OUEST", "FRANCE",
"CENTRE", "SERVICE", "BOURG", "EUROPE",
# Communes homonymes de mots courants français "COURANT",
"ORANGE", "TOURS", "NICE", "SENS", "VITRE", "COU",
"ROMANS", "MENTON", "SALON", "VIENNE", "DOS",
"BREST", # trop court et ambigu "SEIN",
"HYERES", # proche de termes médicaux "BRAS",
"AGEN", "AUCH", "ALBI",
"BLOIS", "LAON", "LENS",
"GIEN", "GRAY",
"AIRE", "LURE", "SETE", "DOLE",
"VIRE", "LUNEL", "MURET", "MORET",
"COEUR", "FOIX", "GIVET",
"EVIAN", "MAURE", "MENDE",
"JOUE", "MEAUX", "REDON",
"CREIL", "CERGY",
# Communes de 4-5 lettres homonymes de mots très courants
"VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
"BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
"VAUX", "VERT", "FAUX", "REZE",
"BILLE", "PLACE", "VILLE", "COURS", "GRAND",
"ROUGE", "RICHE", "NUITS", "SORE", "SARE",
"TRANS", "RANS", "MARSA",
# Mots courants français (6+ lettres) aussi communes
"CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
"MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
"SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
"PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
# Prénoms très courants (aussi communes)
"MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
# Expressions composées ambiguës (aussi communes INSEE)
"LONG", "RECY", "PLAN", "MARCHE", "SALLE",
"CONTRE", "MERE", "ONDRES", "VEBRE",
# Mots structurels / médicaux
"PARIS", # omniprésent, source de faux positifs
"FRANCE", "EUROPE",
# Termes ambigus (aussi communes INSEE) - trackare/DPI
"COURANT", # "Médecin courant" ≠ ville
# Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)")
"COU", "DOS", "SEIN", "BRAS",
} }
# Enrichissement depuis fichier externe (modifiable sans toucher au code) _VILLE_BLACKLIST = _load_wordlist_file(
_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt" Path(__file__).parent / "data" / "villes_blacklist.txt",
if _villes_bl_file.exists(): transform=str.upper,
try: label="Villes blacklist",
for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines(): )
_w = _line.strip() if not _VILLE_BLACKLIST:
if _w and not _w.startswith("#"): _VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK)
_VILLE_BLACKLIST.add(_w) _BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST)
log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST))
except Exception as _exc:
log.error("Villes blacklist : erreur de lecture %s%s", _villes_bl_file, _exc)
else:
log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file)
try: try:
import ahocorasick as _ahocorasick import ahocorasick as _ahocorasick
@@ -331,7 +322,7 @@ def load_medical_whitelists():
global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
# 1. Charger les termes médicaux structurels # 1. Charger les termes médicaux structurels
config_path = Path("config/medical_terms_whitelist.yml") config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml"
if config_path.exists() and yaml: if config_path.exists() and yaml:
try: try:
with open(config_path, 'r', encoding='utf-8') as f: with open(config_path, 'r', encoding='utf-8') as f:
@@ -345,48 +336,20 @@ def load_medical_whitelists():
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels) # 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
_MEDICATION_WHITELIST = _load_edsnlp_drug_names() _MEDICATION_WHITELIST = _load_edsnlp_drug_names()
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names()) _MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
# Ajouter médicaments manquants _MEDICATION_WHITELIST.update(
additional_meds = { _load_wordlist_file(
"idacio", "salazopyrine", "infliximab", "apranax", Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt",
"ketoprofene", "prevenar", "pneumovax", "bétadine" transform=str.lower,
} label="Whitelist médicaments manuelle",
_MEDICATION_WHITELIST.update(additional_meds) min_len=3,
)
)
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)") log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
# Charger les whitelists au démarrage du module # Charger les whitelists au démarrage du module
load_medical_whitelists() load_medical_whitelists()
# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": False,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
PLACEHOLDERS = { PLACEHOLDERS = {
"EMAIL": "[EMAIL]", "EMAIL": "[EMAIL]",
"TEL": "[TEL]", "TEL": "[TEL]",
@@ -445,408 +408,49 @@ def validate_nir(nir_raw: str) -> bool:
return False return False
return key_int == (97 - (body_int % 97)) return key_int == (97 - (body_int % 97))
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes # Mots médicaux/techniques/courants qui ne sont pas des noms de personnes.
_MEDICAL_STOP_WORDS_SET = { # Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp.
# Mots français courants (déterminants, prépositions, adverbes, etc.) _MEDICAL_STOP_WORDS_FALLBACK = {
"pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous", "date",
"mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par", "note",
"les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces", "heure",
"cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant", "type",
"puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques", "traitement",
"mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours", "traitements",
"semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau", "soins",
"franche", "légère", "quelque", "depuis", "comme", "encore", "votre", "surveillance",
"date", "note", "notes", "nom", "heure", "matin", "soir", "midi", "consultation",
"signé", "réalisé", "courrier", "cabinet", "rue", "hospitalisation",
# Verbes / participes courants
"remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
"prescrit", "prescrite", "présente", "présent", "absente", "absent",
"reprise", "introduction", "arrêt", "relais",
# Titres / rôles hospitaliers
"chef", "assistant", "assistante", "praticien", "praticienne",
"docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
"spécialiste", "contractuel", "contractuelle", "titulaire",
"confrère", "consoeur", "coordonnateur", "coordonnatrice",
"médecin", "médical", "infirmier", "infirmière",
"praticiens", "patient", "patiente",
# Structure hospitalière
"service", "pôle", "clinique", "consultation", "secrétariat",
"hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
# Villes / géographie (pas des noms de personnes)
"bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
"toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
"basque", "basques", "sud", "côte",
# Médicaments génériques et spécialités (DCI + noms commerciaux)
"colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
"methotrexate", "eplerenone", "speciafoldine", "prednisone",
"corticoïdes", "cortisone",
"paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
"lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
"insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
"ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
"morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
"seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
"bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
"quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
"opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
"laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
"terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
"lansoprazole", "perindopril", "sodium", "velmetia",
"doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
"augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
"ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
"irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
"spironolactone", "furosemide", "lasilix", "aldactone",
"tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
"xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
"plavix", "clopidogrel", "ticagrelor", "brilique",
"ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
"salbutamol", "tiotropium", "budesonide", "beclometasone",
"oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
"nubain", "nalbuphine", "nefopam", "acupan", "profenid",
"ibuprofene", "diclofenac", "naproxene", "celecoxib",
"gabapentine", "pregabaline", "lyrica", "neurontin",
"amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
"paroxetine", "escitalopram", "citalopram", "mirtazapine",
"olanzapine", "risperidone", "aripiprazole", "haloperidol",
"loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
"clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
"stilnox", "zolpidem", "imovane",
"levothyroxine", "metformine", "glimepiride", "sitagliptine",
"januvia", "jardiance", "empagliflozine", "dapagliflozine",
"ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
"heparine", "enoxaparine", "tinzaparine", "innohep",
"warfarine", "coumadine", "fluindione", "previscan",
"ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
"vancomycine", "gentamicine", "tazocilline", "piperacilline",
"meropenem", "imipenem", "clindamycine", "doxycycline",
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
"polyionique", "propranolol", "apidra", "solostar",
# Noms et suffixes laboratoires pharmaceutiques
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
"evolugen", "alter", "zydus", "medisol", "substipharm",
"sdz", "bgr", "egt", "rnb",
# Formes galéniques / voies d'administration
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
"unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
"orodisp", "capsule", "patch", "suppositoire", "gouttes",
# Termes de prescription / pharmacie
"prescription", "prescriptions", "dose", "fréquence", "statut",
"technique", "capteur", "bandelettes", "glycemiques", "glycemique",
"lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
"glycemie", "capillaire", "hgt",
# Termes médicaux / cliniques
"myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
"dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
"antécédents", "examen", "bilan", "résultats", "analyse",
"interne", "externe", "médecine", "chirurgie", "rhumatologie",
"dermatologie", "immunologie", "cardiologie", "pneumologie",
"neurologie", "gynécologie", "radiologie", "sénologie",
"douleur", "douleurs", "douloureux", "musculaire", "musculaires",
"thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
"normale", "normaux", "habituelle", "habituelles",
"synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
"pelvien", "diabétique", "sommeil", "régime", "diet",
"desinfection", "environnement", "identification", "bracelet",
"toilettes", "accompagner", "installer", "transfusion",
"signes", "vitaux", "alimentaire", "avis", "zone",
"calcémie",
# Abréviations médicales
"irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
"bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
"saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
"poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
"qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
"vit", "zen",
"scanner", "radio", "écho", "échographie",
# Spécialités médicales (éviter faux positifs NOM)
"hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
"proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
"cardiologue", "néphrologue", "urologue", "gériatre",
"hépatologue", "endocrinologue", "stomatologue",
# Termes médicaux / titres fréquemment détectés comme NOM par le NER
"supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
"suivi", "sortie", "emog", "ophtalmo",
# Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
"eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
"lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
"depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
"rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
"pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
# Termes médicaux / soins / actes détectés comme NOM
"partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
"diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
"ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
"diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
# Mots courants français détectés comme NOM dans les trackare
"toilette", "repas", "poche", "installation", "education", "éducation",
"refection", "réfection", "complete", "complète", "regime", "régime",
"normal", "traité", "traite", "arrêté", "arrete", "volume",
"commentaires", "france", "covid", "framboise", "epoux", "époux",
# Abréviations médicales courtes (3-4 chars) détectées comme NOM
"ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
"mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
"amox", "endoc", "microg", "item", "pyélo", "néphro",
# En-têtes de colonnes / mots structurels trackare
"observations", "observation", "commentaires", "commentaire",
"surveillance", "température", "temperature", "glycémie", "glycemie",
"diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
"saturation", "fréquence", "frequence", "respiratoire", "douleur",
"alertes", "alerte", "antécédents", "antecedents", "habitus",
"allergies", "prescriptions", "prescription", "administration",
"catégorie", "categorie", "expiration", "message",
"destination", "diagnostique", "diagnostiques",
"date", "note", "nom", "heure", "type", "code", "etat",
"comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
# Médicaments supplémentaires détectés dans les trackare
"depakote", "versatis", "humalog", "forxiga", "durogesic",
"montelukast", "rosuvastatine",
# Abréviations pharma courtes
"cpr", "sol", "bic", "agt", "poche", "inhal",
# Termes chirurgicaux/cliniques FP
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
"gauche", "droit", "droite", "face", "profil",
# Faux positifs EDS supplémentaires
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
"10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
"actiskenan", "simvastatine", "forlax",
# Mots temporels / contextuels détectés comme EDS_HOPITAL
"semaine", "jour", "matin", "soir", "nuit", "midi",
# Mots clés de contexte document
"compétences", "maladies", "inflammatoires", "systémiques", "rares",
"fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
"haute", "maison", "aide", "rpps", "poste", "fonct",
"sante", "santé", "etxe", "ttipi", "gastro", "concha",
"endoscopie", "endoscopique", "fibroscopie",
"indication", "conclusion", "technique", "anesthésie",
"digestif", "digestive", "digestives", "nutritive",
# Abréviations soins trackare détectées comme NOM (batch 20 OGC)
"soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
# Verbes d'instructions soins (aussi des patronymes INSEE → FP)
"coucher", "manger", "marcher", "sortir",
"verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
# Mots narratifs CRH capturés par fusion sidebar 2-colonnes
"evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
"lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
"paracetamol", "paracétamol", "unité", "unite",
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
"glyc", "glycosurie", "vider", "forte",
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
"responsable", "autre", "autres", "autonome", "autonomes",
"préparations", "preparations", "prévenir", "prevenir",
"acétylsalicylique", "acetylsalicylique", "angio",
"desc", "diu", "barreau",
"haitz", "alde",
# FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
"alimentation", "augmentation", "amelioration", "amélioration",
"biliaire", "biliaires", "bili", "voies", "voie",
"apyrexie", "apyréxie", "apyrétique", "apyretique",
"clavulanique", "mecillinam", "sulfamides", "sulfamide",
"tazobactam", "temocilline", "ecoflac", "furanes", "furane",
"exilar", "lipruzet", "mopral",
"sensible", "sensibles", "dossier", "dossiers",
"entero", "entéro", "medecine", "bio",
"aviation", "contention", "isolement",
"elimination", "élimination", "infectieux",
"hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
"cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
"appendicectomie", "néoplasie", "neoplasie",
"ovarienne", "prandial", "fébrile", "febrile",
"eupnéique", "eupneique", "normocarde", "normotendue",
"variable", "dosage", "posologie",
# Abréviations diététiques/soins trackare
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
# FP audit OGC 17 CRH
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
"saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
# Spécialités/services récurrents comme FP NOM
"cancérologie", "cancerologie", "réanimation", "reanimation",
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
"ophtalmologie", "stomatologie", "allergologie",
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
"orthopédie", "orthopedie", "traumatologie",
"palliatifs", "palliative", "palliatif",
"addictologie", "alcoologie", "tabacologie",
# FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
"discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
"evendol", "relais", "repas", "poursuite", "indication",
# FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
"eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
"thermie", "alim", "alimentation", "admin",
# Médicaments/tests labo capturés par patterns soignants
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
"ciprofloxacine", "lavement", "desinfection", "désinfection",
"avaler", "rachis", "lombaire", "thoraco-lombaire",
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
"faible", "fort", "forte",
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
"entree", "entrée", "continu", "continue",
"morphine", "claforan", "skenan", "actiskenan",
# Fragments de noms de médicaments (pdfplumber split)
"sium", "pegic", "fenid", "profenid",
# Catégories cliniques Trackare (en-têtes de section masqués à tort)
"respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
"hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
"transit", "anemie", "anémie", "constantes", "examen",
"post-op", "postop", "pré-op", "preop", "chimio", "elim",
"toilette", "sommeil", "hypota", "hypotension", "spo2",
"urine", "urines", "sng",
"rénale", "renale", "rénal", "renal", "cardiaque",
# Termes structurels trackare
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
"planification", "planifié", "planifiee",
# ── FP détectés automatiquement par audit_fp_detector.py ──
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
"vernis", "vessie", "vrac",
# Lot 2 : termes médicaux (préfixes/suffixes)
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
"cétonurie", "cetonurie", "depilation", "dépilation",
"folique", "gastroentérologue", "gastroenterologue",
"microgrammes", "nalidixique", "naso-gastrique",
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
"cyto", "plaie-colle", "bionolyte",
# Lot 1 (103 tokens, confiance >= 0.5) ──
# Anatomie / clinique
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
# Pathologies / symptômes
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
# Médicaments / matériel médical
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
"oxygène", "pansement", "vitamine",
# Biologie / examens
"biochimie", "biologie", "fer",
# Actions / états cliniques
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
"intervention", "position", "rappel", "relation", "retour", "réalisation",
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
"urgent", "validation",
# Mots courants / contextuels
"angle", "bille", "boisson", "bureau", "cases", "circuit",
"concubin", "confortable", "demain", "densité", "dernière",
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
"personne", "premier", "quartier", "retraite", "route", "rés",
"trouve", "verrouillé", "villa", "étage",
# Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
"ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
"bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
"traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
"maternité", "orale", "sachet", "absence",
# FP audit 30 fichiers Phase 2 (mars 2026)
"bouffee", "bouffée", "discontinue", "respimat", "lyoc",
"probnp", "pro-bnp", "nt-probnp",
"bpco", "colle", "gsc", "masse",
"selle", "selles",
# Acronymes médicaux courts (3 lettres) souvent FP comme NOM
"epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
"imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
"hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
"vni", "aeg", "bas", "snv", "hba", "ide", "dci",
# Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026)
"buvable", "buvables", "nominal", "nominaux",
"acide", "principale", "principal", "principaux",
"hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
"clavulanique", "nalidixique",
"grancher", # Centre de réadaptation (nom d'établissement homonyme)
"experf", # Prestataire HAD (nom commercial homonyme)
# Noms de services hospitaliers (FP comme [NOM])
"ortho", "mobile", "polyvalente", "polyvalent",
"geriatrie", "gériatrie", "ambulatoire", "provisoire",
"intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
# Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents)
"viscerale", "viscérale", "vasculaire", "vasculaires",
"conventionnelle", "conventionnel",
"polyvalente", "polyvalent",
"infectieuse", "infectieuses",
# Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216)
"aide", "partielle", "partiel", "complete", "complète", "complet",
"contention", "lavabo", "blader", "scan", "post", "lunettes",
"deshabillage", "déshabillage", "habillage",
"surveillance", "surv", "refection", "réfection",
"miction", "toilette", "douche", "changes",
"installation", "transfert", "mobilisation",
"alimentation", "hydratation", "collation",
"stimulation", "prevention", "prévention",
# Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16)
"chlorure",
# Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
"canule", "canules", "masque", "sonde", "sondes",
# Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
"totale", "total", "partielle", "partiel",
"prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
# Antiseptiques / produits de soins (FP trackare prescriptions)
"betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
# Nutrition entérale / compléments
"fresubin", "nutrison", "sondalis", "isosource", "novasource",
# Termes médicaux FP dans bactério / texte libre
"nombreuses", "nombreux", "plusieurs", "quelques",
"internationale", "international",
"resorbable", "résorbable", "resorbables", "résorbables",
"alfa", "capsule", "capsules",
} }
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp _MEDICAL_STOP_WORDS_SET = _load_wordlist_file(
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names()) Path(__file__).parent / "data" / "stopwords_manuels.txt",
transform=str.lower,
# Enrichissement depuis fichier externe (modifiable sans toucher au code) label="Stop-words manuels",
_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt"
if _stopwords_file.exists():
try:
_sw_count = 0
for _line in _stopwords_file.read_text(encoding="utf-8").splitlines():
_w = _line.strip()
if _w and not _w.startswith("#"):
_MEDICAL_STOP_WORDS_SET.add(_w)
_sw_count += 1
log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name)
except Exception as _exc:
log.error("Stop-words manuels : erreur de lecture %s%s", _stopwords_file, _exc)
else:
log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file)
# Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives
_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
if _bdpm_path.exists():
try:
_bdpm_count = 0
for _line in _bdpm_path.read_text(encoding="utf-8").splitlines():
_w = _line.strip()
if _w and not _w.startswith("#"):
_MEDICAL_STOP_WORDS_SET.add(_w)
_bdpm_count += 1
log.info("BDPM stop-words chargés : %d mots", _bdpm_count)
except Exception as _exc:
log.error("BDPM stop-words : erreur de lecture %s%s", _bdpm_path, _exc)
else:
log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path)
_MEDICAL_STOP_WORDS = (
r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
) )
# Extend the stop-word set with whatever `_load_edsnlp_drug_names()` returns
# (the edsnlp drug-name list).
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
# Add the BDPM entries read from data/bdpm/medicaments_stopwords.txt;
# `str.lower` is handed to the loader as the per-entry transform.
_MEDICAL_STOP_WORDS_SET.update(
    _load_wordlist_file(
        Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt",
        transform=str.lower,
        label="BDPM stop-words",
    )
)
# Nothing could be loaded from any source: use the
# `_MEDICAL_STOP_WORDS_FALLBACK` values instead of an empty set.
if not _MEDICAL_STOP_WORDS_SET:
    _MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK)
# Independent copy of the set as assembled at import time;
# `load_dictionaries` starts again from this copy before adding YAML extras.
_BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET)
def _refresh_medical_stopwords_pattern() -> None:
    """Rebuild the module-level ``_MEDICAL_STOP_WORDS`` regex fragment.

    The fragment is a non-capturing alternation of every entry of
    ``_MEDICAL_STOP_WORDS_SET``, regex-escaped and emitted in sorted order so
    the result is deterministic. An empty set produces ``(?!)``, an empty
    negative lookahead that can never match.
    """
    global _MEDICAL_STOP_WORDS
    if _MEDICAL_STOP_WORDS_SET:
        alternatives = [re.escape(word) for word in sorted(_MEDICAL_STOP_WORDS_SET)]
        _MEDICAL_STOP_WORDS = r"(?:" + "|".join(alternatives) + r")"
    else:
        _MEDICAL_STOP_WORDS = r"(?!)"
_refresh_medical_stopwords_pattern()
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point) # Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" _PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile( RE_PERSON_CONTEXT = re.compile(
@@ -985,7 +589,17 @@ RE_CIVILITE_INITIALE = re.compile(
# --- N° examen / N° patient imagerie (radiologie) --- # --- N° examen / N° patient imagerie (radiologie) ---
RE_NUM_EXAMEN_PATIENT = re.compile( RE_NUM_EXAMEN_PATIENT = re.compile(
r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})", r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*"
r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)",
re.IGNORECASE,
)
# --- N° bare d'entête labo / imagerie ---
# Exemple:
# N° 23L35781
# Prélevé le 26/07/2023 Enregistré le 27/07/2023
RE_NUM_ACCESSION_HEADER = re.compile(
r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n"
r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)",
re.IGNORECASE, re.IGNORECASE,
) )
@@ -1177,6 +791,7 @@ _DPI_LABELS_SET: set = _load_txt_set(
) )
if not _DPI_LABELS_SET: if not _DPI_LABELS_SET:
_DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK) _DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)
_BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET)
# Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms # Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
# (spécialités, labos pharma, mots courants ambigus). # (spécialités, labos pharma, mots courants ambigus).
@@ -1189,6 +804,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set(
) )
if not _COMPANION_BLACKLIST_SET: if not _COMPANION_BLACKLIST_SET:
_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK) _COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)
_BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET)
_WHITELIST_FUNCTION_WORDS = { _WHITELIST_FUNCTION_WORDS = {
@@ -1223,14 +839,15 @@ def _load_whitelist_phrases(phrases) -> int:
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
cfg = DEFAULTS_CFG.copy() global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
if config_path and config_path.exists() and yaml is not None: cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
try:
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} _MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
for k, v in user.items(): _VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
cfg[k] = v _DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
except Exception: _COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET)
pass _WHITELIST_NEVER_MASK_TOKENS.clear()
_WHITELIST_NEVER_MASK_PHRASES.clear()
# Charger les stop-words et villes supplémentaires depuis le YAML # Charger les stop-words et villes supplémentaires depuis le YAML
extra_sw = cfg.get("additional_stopwords", []) extra_sw = cfg.get("additional_stopwords", [])
@@ -1239,6 +856,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
if w and str(w).strip(): if w and str(w).strip():
_MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower()) _MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
log.info("Stop-words YAML supplémentaires : %d", len(extra_sw)) log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))
_refresh_medical_stopwords_pattern()
extra_villes = cfg.get("additional_villes_blacklist", []) extra_villes = cfg.get("additional_villes_blacklist", [])
if extra_villes: if extra_villes:
@@ -1871,8 +1489,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
return key return key
def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str:
start = full_match.find(captured_value)
if start < 0:
return placeholder
end = start + len(captured_value)
return full_match[:start] + placeholder + full_match[end:]
def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask structured fields whose detection depends on the label of the line.

    Four patterns are applied in sequence: postal code, exam/patient number,
    file number, then visit/stay number. For each match only the captured
    value is replaced, so the label stays readable, and a ``PiiHit`` is
    appended to ``audit``.

    Args:
        line: Text line to process.
        audit: List that receives one ``PiiHit`` per masked value (mutated).
        page_idx: Page index recorded in each ``PiiHit``.

    Returns:
        The line with every detected value replaced by its placeholder.
    """

    def _first_filled(m: re.Match, *groups: int) -> int:
        # Index of the first non-empty group among `groups`, else 0 (whole match).
        return next((g for g in groups if m.group(g)), 0)

    def _mask_group(m: re.Match, group: int, placeholder: str) -> str:
        # Splice by the group's own span instead of searching for its text:
        # a text search hits the FIRST occurrence, which may sit earlier in
        # the match than the captured group itself.
        if group == 0 or m.start(group) < 0:
            return placeholder
        whole = m.group(0)
        base = m.start()
        return whole[: m.start(group) - base] + placeholder + whole[m.end(group) - base:]

    def _repl_code_postal(m: re.Match) -> str:
        placeholder = PLACEHOLDERS["CODE_POSTAL"]
        audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(_first_filled(m, 1, 2)), placeholder))
        # Only group 1 is masked in place; any other match shape is masked whole.
        return _mask_group(m, 1 if m.group(1) else 0, placeholder)

    def _repl_num_examen(m: re.Match) -> str:
        placeholder = PLACEHOLDERS["DOSSIER"]
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(1), placeholder))
        return _mask_group(m, 1, placeholder)

    def _repl_dossier(m: re.Match) -> str:
        placeholder = PLACEHOLDERS["DOSSIER"]
        group = _first_filled(m, 1, 2)
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(group), placeholder))
        return _mask_group(m, group, placeholder)

    def _repl_venue(m: re.Match) -> str:
        placeholder = PLACEHOLDERS["NDA"]
        audit.append(PiiHit(page_idx, "NDA", m.group(1), placeholder))
        return _mask_group(m, 1, placeholder)

    # Order matters: each pass runs on the output of the previous one.
    masked = RE_CODE_POSTAL.sub(_repl_code_postal, line)
    masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked)
    masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked)
    masked = RE_VENUE_SEJOUR.sub(_repl_venue, masked)
    return masked
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
line = _mask_admin_label(line, audit, page_idx) line = _mask_admin_label(line, audit, page_idx)
structured_line = _mask_structured_line(line, audit, page_idx)
if structured_line != line:
return structured_line
parts = SPLITTER.split(line, maxsplit=1) parts = SPLITTER.split(line, maxsplit=1)
if len(parts) == 2: if len(parts) == 2:
key, value = parts key, value = parts
@@ -2413,6 +2072,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
for m in _RE_EMAIL_HEADER.finditer(full_text): for m in _RE_EMAIL_HEADER.finditer(full_text):
_add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium") _add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")
# En-têtes patient en capitales, sans libellé explicite.
# Exemple:
# ETCHEVERRY JEAN CLAUDE
# On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom
# INSEE et un nom de famille INSEE. Les tokens proposés viennent
# exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
_UPPER_NAME_LINE_RE = re.compile(
r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$",
re.MULTILINE,
)
for m in _UPPER_NAME_LINE_RE.finditer(full_text):
raw_line = re.sub(r"\s+", " ", m.group(1)).strip()
tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")]
if len(tokens) < 2 or len(tokens) > 4:
continue
if any(len(tok) < 3 for tok in tokens):
continue
norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens]
has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens)
has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens)
if not (has_prenom and has_nom):
continue
for tok, norm_tok in zip(tokens, norm_tokens):
if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE:
_add_candidate(tok, "UPPER_NAME_LINE", "low", False)
# Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"), # Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
# ajouter aussi les parties individuelles pour capturer les occurrences standalone. # ajouter aussi les parties individuelles pour capturer les occurrences standalone.
# _apply_extracted_names traite le composé en premier (plus long) puis les parties. # _apply_extracted_names traite le composé en premier (plus long) puis les parties.
@@ -2582,10 +2270,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str: def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
"""Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.). """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt.""" mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
_APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"} _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
# Collecter les valeurs à remplacer, groupées par placeholder # Collecter les valeurs à remplacer, groupées par placeholder
replacements: Dict[str, str] = {} # original → placeholder replacements: Dict[str, str] = {} # original → placeholder
for h in audit: for h in audit:
@@ -2698,7 +2386,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_IPP_MULTILINE.finditer(full_raw): for m in _RE_IPP_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"])) audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164) # Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie
# Ex:
# N° 23L35781
# Prélevé le 26/07/2023
for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164)
_RE_DEMANDE_MULTILINE = re.compile( _RE_DEMANDE_MULTILINE = re.compile(
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})", r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
re.IGNORECASE, re.IGNORECASE,
@@ -2706,14 +2401,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw): for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés) # Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
_RE_VENUE_MULTILINE = re.compile( _RE_VENUE_MULTILINE = re.compile(
r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})", r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
re.IGNORECASE, re.IGNORECASE,
) )
for m in _RE_VENUE_MULTILINE.finditer(full_raw): for m in _RE_VENUE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"])) audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
# Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label) # Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
_RE_VENUE_REVERSE = re.compile( _RE_VENUE_REVERSE = re.compile(
r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$", r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
re.IGNORECASE | re.MULTILINE, re.IGNORECASE | re.MULTILINE,
@@ -3092,55 +2787,17 @@ def _build_finess_ac():
return return
# Mots génériques qui ne doivent jamais être matchés seuls # Mots génériques qui ne doivent jamais être matchés seuls
_ac_generic_blacklist = { _ac_generic_blacklist = _load_wordlist_file(
# Types d'établissements data_dir / "generic_name_blacklist.txt",
"clinique", "pharmacie", "hopital", "centre", "foyer", transform=str.lower,
"residence", "maison", "cabinet", "service", "laboratoire", label="FINESS noms génériques blacklist",
"institut", "association", "fondation", "mutuelle", "polyclinique", )
"dispensaire", "hospice", "annexe", "antenne", "site",
# Mots français courants qui sont aussi des noms d'établissements
"collegiale", "collegial", "cathedral", "cathedrale",
"providence", "esperance", "renaissance", "liberation",
"republique", "fraternite", "solidarite", "independance",
"beauregard", "bellevue", "belvedere",
"promenade", "esplanade", "corniche", "prefecture",
"croissant", "confluence", "bienvenue",
"chartreuse", "commanderie", "chapelle", "basilique",
"departement", "departementale", "communautaire",
# Spécialités médicales / termes cliniques courants
"chirurgicale", "radiologie", "addictologie", "prevention",
"psychotherapique", "ambulatoire", "hospitalisation",
"consultation", "surveillance", "therapeutique",
"readaptation", "reeducation", "reanimation",
"specialisee", "conventionnelle", "professionnelle",
"informatique", "administrative", "regionale",
# Mots communs
"generation", "revolution", "assomption", "visitation",
"consolation", "atlantique", "manutention", "prefiguration",
"intervalle", "pharmaciens", "pharmacien", "transfert",
"comprimee", "comprimees", "injectable", "injectables",
"maintenant", "actuellement", "auparavant", "prochainement",
"rapidement", "correctement", "directement", "simplement",
"internationale", "international", "intercommunal", "intercommunale",
# Termes médicaux homonymes d'établissements FINESS (retour relecteur 2026-03-17)
"resistance", "radiotherapie", "chimiotherapie", "curietherapie",
"hormonotherapie", "immunotherapie", "kinesitherapie",
"ergotherapie", "orthophonie", "psychomotricite",
"reeducation", "readaptation", "convalescence",
"dependance", "autonomie", "gerontologie",
}
# Expressions multi-mots trop génériques # Expressions multi-mots trop génériques
_ac_generic_phrases = { _ac_generic_phrases = _load_wordlist_file(
"a domicile", "au domicile", "menage a domicile", data_dir / "generic_phrase_blacklist.txt",
"du nord", "du sud", "de l est", "de l ouest", transform=str.lower,
"la maison", "la residence", "les jardins", label="FINESS expressions génériques blacklist",
"le village", "le parc", "la colline", )
"au soleil", "en france",
# Expressions médicales homonymes d'établissements FINESS (FP relecteur 2026-03-16)
"long cours", "au long cours",
"le bourg", "le val", "le clos", "le mas",
"les pins", "les chenes", "les oliviers",
}
# Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs # Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs
# (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars). # (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars).
# Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle. # Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle.
@@ -3365,8 +3022,11 @@ def _build_finess_addr_ac():
"sentier", "rond-point", "traverse", "esplanade", "sentier", "rond-point", "traverse", "esplanade",
"promenade", "montee", "voie", "carrefour", "faubourg"} "promenade", "montee", "voie", "carrefour", "faubourg"}
# Patterns non-adresse à exclure # Patterns non-adresse à exclure
_addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier", _addr_blacklist = _load_wordlist_file(
"cabinet paramedical", "cabinet sage-femme"} data_dir / "address_blacklist.txt",
transform=str.lower,
label="FINESS adresses blacklist",
)
for line in addr_path.read_text(encoding="utf-8").splitlines(): for line in addr_path.read_text(encoding="utf-8").splitlines():
name = line.strip() name = line.strip()
if not name or len(name) < 10: if not name or len(name) < 10:
@@ -3804,11 +3464,19 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected) def _rescan_code_postal(m: re.Match) -> str:
if m.group(1):
return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"])
return PLACEHOLDERS["CODE_POSTAL"]
protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected)
# N° Episode # N° Episode
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected) protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
# N° venue / séjour # N° venue / séjour
protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected) protected = RE_VENUE_SEJOUR.sub(
lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]),
protected,
)
# N° RPPS # N° RPPS
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected) protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS) # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
@@ -4825,7 +4493,7 @@ if __name__ == "__main__":
ap.add_argument("--out", type=str, default="out") ap.add_argument("--out", type=str, default="out")
ap.add_argument("--no-vector", action="store_true") ap.add_argument("--no-vector", action="store_true")
ap.add_argument("--raster", action="store_true") ap.add_argument("--raster", action="store_true")
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)") ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner") ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
args = ap.parse_args() args = ap.parse_args()

View File

@@ -33,6 +33,7 @@ python -m nuitka ^
--include-module=ner_manager_onnx ^ --include-module=ner_manager_onnx ^
--include-module=eds_pseudo_manager ^ --include-module=eds_pseudo_manager ^
--include-data-dir=config=config ^ --include-data-dir=config=config ^
--include-data-dir=data=data ^
--include-data-dir=models=models ^ --include-data-dir=models=models ^
--nofollow-import-to=onnxruntime ^ --nofollow-import-to=onnxruntime ^
--nofollow-import-to=numpy ^ --nofollow-import-to=numpy ^

View File

@@ -0,0 +1,59 @@
# Template versionné des règles d'anonymisation.
# Ce fichier décrit les valeurs par défaut complètes de l'application.
# La surcharge locale chargée par défaut est config/dictionnaires.yml.
version: 1
encoding: utf-8
normalization: NFKC
whitelist:
sections_titres:
- DIM
- GHM
- GHS
- RUM
- COMPTE
- RENDU
- DIAGNOSTIC
noms_maj_excepts:
- Médecin DIM
- Praticien conseil
org_gpe_keep: false
blacklist:
# Sigles et libellés propres à l'établissement non couverts par les gazetteers
# nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
# villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
force_mask_terms:
- CHCB
- 'Dates du séjour :'
- CONCERTATION
- LABORATOIRE de BIOLOGIE MEDICALE
force_mask_regex:
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
kv_labels_preserve:
- FINESS
- IPP
- N° OGC
- Etablissement
regex_overrides:
- name: OGC_court
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags:
- IGNORECASE
whitelist_phrases:
- "classification internationale"
- "prise en charge"
- "bas de contention"
- "date de naissance"
- "lieu de naissance"
- "ville de résidence"
- "date de sortie"
- "date d'admission"
- "code postal"
additional_stopwords: []
additional_villes_blacklist: []
additional_dpi_labels: []
additional_companion_blacklist: []
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: python

View File

@@ -1,83 +1,11 @@
version: 1 # Surcharge locale chargée par défaut par l'application.
encoding: utf-8 # Source de vérité des valeurs par défaut : config/dictionnaires.default.yml
normalization: NFKC # Ce fichier ne doit contenir que les écarts spécifiques à l'environnement courant.
whitelist: #
sections_titres: # Exemples :
- DIM # blacklist:
- GHM # force_mask_terms:
- GHS # - VOTRE_SIGLE
- RUM # additional_stopwords:
- COMPTE # - votre_terme
- RENDU {}
- DIAGNOSTIC
noms_maj_excepts:
- Médecin DIM
- Praticien conseil
org_gpe_keep: false
blacklist:
# Sigles et libellés propres à l'établissement non couverts par les gazetteers
# nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
# villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
force_mask_terms:
- CHCB # Sigle local non référencé FINESS
- 'Dates du séjour :' # Libellé administratif (politique masquage)
- CONCERTATION # Mention de RCP (politique métier)
- LABORATOIRE de BIOLOGIE MEDICALE # Libellé administratif générique
force_mask_regex:
# Adresse précise du CHCB — couverte par l'AC FINESS adresses mais on garde
# la regex en filet de sécurité (encodages PDF, espaces non standards).
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
kv_labels_preserve:
- FINESS
- IPP
- N° OGC
- Etablissement
regex_overrides:
- name: OGC_court
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags:
- IGNORECASE
# Phrases à ne JAMAIS anonymiser (faux positifs récurrents)
# Ajouter ici les expressions qui sont masquées à tort.
# La correspondance est insensible à la casse.
whitelist_phrases:
- "classification internationale"
- "prise en charge"
- "bas de contention"
- "date de naissance"
- "lieu de naissance"
- "ville de résidence"
- "date de sortie"
- "date d'admission"
- "code postal"
# Mots supplémentaires à ne jamais masquer comme noms de personnes
# (complète les 9000+ stop-words intégrés)
additional_stopwords: []
# Exemple :
# - "votre_mot"
# Villes supplémentaires à ne jamais matcher comme lieux
# (complète les 115+ villes blacklistées intégrées)
additional_villes_blacklist: []
# Exemple :
# - "VOTRE_VILLE"
# Labels DPI supplémentaires à ne jamais masquer comme noms
# (complète data/dpi_labels_blacklist.txt)
# Utiliser pour : titres de colonnes, en-têtes de sections, libellés de champs
additional_dpi_labels: []
# Exemple :
# - "Service"
# - "Statut"
# Termes en MAJUSCULES à ne jamais propager comme noms compagnons
# (complète data/companion_blacklist.txt — spécialités, labos pharma, mots ambigus)
additional_companion_blacklist: []
# Exemple :
# - "VOTRE_SPECIALITE"
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: python

177
config_defaults.py Normal file
View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Helpers partagés pour la config dictionnaires.
"""
from __future__ import annotations
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict
try:
import yaml
except Exception:
yaml = None
# Directory that contains this module; every config path below is derived from it.
PROJECT_DIR = Path(__file__).resolve().parent
# Folder holding the dictionary configuration files.
CONFIG_DIR = PROJECT_DIR / "config"
# Template file read by read_default_dictionaries_text() as the source of default values.
DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.default.yml"
# Local overlay file; used whenever a loader below is called without an explicit path.
RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.yml"
# Initial content written to a freshly created local overlay file (see
# ensure_runtime_dictionaries_config). Apart from explanatory YAML comments it
# holds only an empty mapping, so a new overlay changes nothing by itself.
_RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application.
# Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici.
# Si ce fichier est vide, les valeurs du template par défaut s'appliquent.
#
# Exemples :
# blacklist:
# force_mask_terms:
# - VOTRE_SIGLE
# additional_stopwords:
# - votre_terme
{}
"""
# Built-in YAML text returned by read_default_dictionaries_text() when the
# default template file cannot be read. Backslashes are doubled because this
# is a regular (non-raw) string literal.
# NOTE(review): appears to carry the same values as
# _FALLBACK_DEFAULT_DICTIONARIES_DICT; keep the two in sync when editing.
_FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1
encoding: utf-8
normalization: NFKC
whitelist:
sections_titres:
- DIM
- GHM
- GHS
- RUM
- COMPTE
- RENDU
- DIAGNOSTIC
noms_maj_excepts:
- Médecin DIM
- Praticien conseil
org_gpe_keep: false
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve:
- FINESS
- IPP
- N° OGC
- Etablissement
regex_overrides:
- name: OGC_court
pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b
placeholder: '[OGC]'
flags:
- IGNORECASE
whitelist_phrases: []
additional_stopwords: []
additional_villes_blacklist: []
additional_dpi_labels: []
additional_companion_blacklist: []
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: python
"""
# Already-parsed default configuration. load_default_dictionaries_dict() returns
# a deep copy of it when PyYAML is unavailable or when the default text does not
# parse to a mapping, so callers never mutate this shared constant.
# NOTE(review): appears to mirror _FALLBACK_DEFAULT_DICTIONARIES_TEXT; keep the
# two in sync when editing either one.
_FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": False,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
# Raw string: same pattern as the doubled-backslash form in the YAML fallback text.
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"whitelist_phrases": [],
"additional_stopwords": [],
"additional_villes_blacklist": [],
"additional_dpi_labels": [],
"additional_companion_blacklist": [],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
def read_default_dictionaries_text() -> str:
    """Return the YAML text of the default dictionaries configuration.

    The text is read from ``DEFAULT_DICTIONARIES_CONFIG_PATH`` as UTF-8. If
    that read fails for any reason, the built-in fallback text is returned
    instead, so this function never raises.
    """
    try:
        template_text = DEFAULT_DICTIONARIES_CONFIG_PATH.read_text(encoding="utf-8")
    except Exception:
        # Best effort: an unreadable template must not prevent start-up.
        template_text = _FALLBACK_DEFAULT_DICTIONARIES_TEXT
    return template_text
def read_runtime_dictionaries_overlay_text() -> str:
    """Return the initial text used to seed a new local overlay file.

    This is the in-module template constant; nothing is read from disk.
    """
    overlay_template = _RUNTIME_DICTIONARIES_OVERLAY_TEXT
    return overlay_template
def load_default_dictionaries_dict() -> Dict[str, Any]:
    """Return the default dictionaries configuration as a dict.

    The default YAML text is parsed with ``yaml.safe_load`` when PyYAML is
    available. If PyYAML is missing, parsing raises, or the parsed document is
    not a mapping, a deep copy of the built-in fallback dict is returned.
    """
    raw_text = read_default_dictionaries_text()
    parsed: Any = None
    if yaml is not None:
        try:
            # An empty document parses to None; treat it as an empty mapping.
            parsed = yaml.safe_load(raw_text) or {}
        except Exception:
            parsed = None
    if isinstance(parsed, dict):
        return parsed
    # Deep copy so callers can mutate the result without touching the constant.
    return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT)
def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]:
    """Load the local overlay file and return its content as a dict.

    Args:
        path: Overlay file to read. When ``None``, the default
            ``RUNTIME_DICTIONARIES_CONFIG_PATH`` is used.

    Returns:
        The parsed mapping, or ``{}`` when the file does not exist, is empty,
        cannot be read or parsed, does not contain a mapping, or when PyYAML
        is not installed.

    This function never raises. Every case where an *existing* overlay is
    ignored is logged as a warning, because a silently dropped overlay means
    site-specific masking rules are not applied.
    """
    # Local import: the module header does not import logging.
    import logging

    log = logging.getLogger(__name__)
    target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH
    if not target.exists():
        # A missing overlay is the normal "no local changes" case: stay quiet.
        return {}
    if yaml is None:
        log.warning("Surcharge locale ignorée (%s) : PyYAML indisponible.", target)
        return {}
    try:
        loaded = yaml.safe_load(target.read_text(encoding="utf-8"))
    except Exception as exc:
        # Still best effort, but make the failure visible to the operator.
        log.warning("Surcharge locale ignorée (%s) : lecture impossible (%s).", target, exc)
        return {}
    if not loaded:
        # Empty file, or a document holding only comments / an empty mapping.
        return {}
    if isinstance(loaded, dict):
        return loaded
    log.warning(
        "Surcharge locale ignorée (%s) : mapping attendu, %s trouvé.",
        target,
        type(loaded).__name__,
    )
    return {}
def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the defaults merged with the local overlay.

    Args:
        path: Overlay file to apply; ``None`` selects the default overlay path.

    Returns:
        The result of ``deep_merge_dict(defaults, overlay)``.
    """
    defaults = load_default_dictionaries_dict()
    overlay = load_runtime_dictionaries_overlay_dict(path)
    return deep_merge_dict(defaults, overlay)
def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict made of ``base`` with ``override`` layered on top.

    Merge rules, applied per key of ``override``:
      * dict over dict: merged recursively;
      * list over list: override items missing from the base list are
        appended in order (existing items are kept, duplicates are skipped);
      * anything else: the override value replaces the base value.

    Neither argument is modified; values taken from ``override`` are deep
    copied. A falsy ``override`` (``None`` or empty) yields a copy of ``base``.
    """
    result = deepcopy(base)
    if not override:
        return result
    for name, incoming in override.items():
        if isinstance(incoming, dict):
            current = result.get(name)
            if isinstance(current, dict):
                result[name] = deep_merge_dict(current, incoming)
                continue
        elif isinstance(incoming, list):
            current = result.get(name)
            if isinstance(current, list):
                union = current[:]
                for element in incoming:
                    # Equality test, not hashing: items may be dicts.
                    if element not in union:
                        union.append(deepcopy(element))
                result[name] = union
                continue
        # Type mismatch, scalar, or key absent from base: override wins.
        result[name] = deepcopy(incoming)
    return result
def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path:
    """Make sure the local overlay file exists and return its path.

    Args:
        path: Overlay location; ``None`` selects the default overlay path.

    Returns:
        The overlay path as a ``Path``. If nothing existed at that location,
        parent directories are created as needed and the file is written with
        the overlay template text (UTF-8). An existing path is left untouched.
    """
    target = RUNTIME_DICTIONARIES_CONFIG_PATH if path is None else Path(path)
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
    return target

View File

@@ -0,0 +1,11 @@
# Compléments manuels à la whitelist médicaments.
# Un terme par ligne, en lowercase.
idacio
salazopyrine
infliximab
apranax
ketoprofene
prevenar
pneumovax
bétadine

View File

@@ -0,0 +1,7 @@
# Faux positifs à exclure du gazetteer d'adresses FINESS.
cabinet medical
cabinet dentaire
cabinet infirmier
cabinet paramedical
cabinet sage-femme

View File

@@ -0,0 +1,112 @@
# Noms d'établissements trop génériques à ignorer dans l'automate FINESS.
clinique
pharmacie
hopital
centre
foyer
residence
maison
cabinet
service
laboratoire
institut
association
fondation
mutuelle
polyclinique
dispensaire
hospice
annexe
antenne
site
collegiale
collegial
cathedral
cathedrale
providence
esperance
renaissance
liberation
republique
fraternite
solidarite
independance
beauregard
bellevue
belvedere
promenade
esplanade
corniche
prefecture
croissant
confluence
bienvenue
chartreuse
commanderie
chapelle
basilique
departement
departementale
communautaire
chirurgicale
radiologie
addictologie
prevention
psychotherapique
ambulatoire
hospitalisation
consultation
surveillance
therapeutique
readaptation
reeducation
reanimation
specialisee
conventionnelle
professionnelle
informatique
administrative
regionale
generation
revolution
assomption
visitation
consolation
atlantique
manutention
prefiguration
intervalle
pharmaciens
pharmacien
transfert
comprimee
comprimees
injectable
injectables
maintenant
actuellement
auparavant
prochainement
rapidement
correctement
directement
simplement
internationale
international
intercommunal
intercommunale
resistance
radiotherapie
chimiotherapie
curietherapie
hormonotherapie
immunotherapie
kinesitherapie
ergotherapie
orthophonie
psychomotricite
convalescence
dependance
autonomie
gerontologie

View File

@@ -0,0 +1,26 @@
# Expressions FINESS multi-mots trop génériques à ignorer.
a domicile
au domicile
menage a domicile
du nord
du sud
de l est
de l ouest
la maison
la residence
les jardins
le village
le parc
la colline
au soleil
en france
long cours
au long cours
le bourg
le val
le clos
le mas
les pins
les chenes
les oliviers

View File

@@ -37,33 +37,18 @@ try:
except Exception: except Exception:
yaml = None yaml = None
APP_TITLE = "Pseudonymisation de PDF" from config_defaults import (
DEFAULT_CFG = Path("config/dictionnaires.yml") RUNTIME_DICTIONARIES_CONFIG_PATH,
read_default_dictionaries_text,
read_runtime_dictionaries_overlay_text,
)
# YAML par défaut (patterns en bloc littéral pour éviter les échappements) APP_TITLE = "Pseudonymisation de PDF"
DEFAULTS_CFG_TEXT = """# dictionnaires.yml valeurs par défaut DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
version: 1
encoding: "utf-8" # YAML par défaut externalisé dans config/dictionnaires.default.yml
normalization: "NFKC" DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
whitelist: RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
# ---------- util : ToolTip & helpers ---------- # ---------- util : ToolTip & helpers ----------
class ToolTip: class ToolTip:
@@ -211,7 +196,7 @@ class App:
p = Path(self.cfg_path.get()) p = Path(self.cfg_path.get())
p.parent.mkdir(parents=True, exist_ok=True) p.parent.mkdir(parents=True, exist_ok=True)
if not p.exists(): if not p.exists():
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
def _cfg_browse(self): def _cfg_browse(self):
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
@@ -248,7 +233,7 @@ class App:
return return
try: try:
with open(self.cfg_path.get(), "w", encoding="utf-8") as f: with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False) yaml.safe_dump(self.cfg_data or {}, f, allow_unicode=True, sort_keys=False)
self._log("Règles sauvegardées.") self._log("Règles sauvegardées.")
except Exception as e: except Exception as e:
messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}") messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
@@ -258,8 +243,8 @@ class App:
def _restore_defaults(self): def _restore_defaults(self):
try: try:
Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
self._log("Règles restaurées aux valeurs par défaut.") self._log("Surcharge locale réinitialisée.")
self._load_cfg() self._load_cfg()
except Exception as e: except Exception as e:
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

View File

@@ -9,6 +9,7 @@ from collections import Counter
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from eds_pseudo_manager import EdsPseudoManager from eds_pseudo_manager import EdsPseudoManager
from vlm_manager import VlmManager from vlm_manager import VlmManager
from gliner_manager import GlinerManager from gliner_manager import GlinerManager
@@ -16,7 +17,7 @@ from camembert_ner_manager import CamembertNerManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_audit_30" OUTDIR = SRC / "anonymise_audit_30"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
PDFS = [ PDFS = [
SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf", SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",

View File

@@ -9,11 +9,12 @@ from collections import Counter
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from eds_pseudo_manager import EdsPseudoManager from eds_pseudo_manager import EdsPseudoManager
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise" OUTDIR = SRC / "anonymise"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
def main(): def main():
# Charger EDS-Pseudo # Charger EDS-Pseudo

View File

@@ -19,9 +19,11 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)") SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTDIR = SRC / "anonymise_silver_extra" OUTDIR = SRC / "anonymise_silver_extra"
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml") CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
# PDFs déjà traités dans l'audit 30 (à exclure) # PDFs déjà traités dans l'audit 30 (à exclure)
ALREADY_DONE_AUDIT30 = { ALREADY_DONE_AUDIT30 = {

View File

@@ -13,13 +13,18 @@ import json
import sys import sys
from pathlib import Path from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
try: try:
import yaml import yaml
except ImportError: except ImportError:
print("ERREUR : pyyaml requis (pip install pyyaml)") print("ERREUR : pyyaml requis (pip install pyyaml)")
sys.exit(1) sys.exit(1)
CONFIG = Path(__file__).parent.parent / "config" / "dictionnaires.yml" from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False): def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False):

View File

@@ -29,6 +29,8 @@ from typing import Optional
from fastapi import FastAPI, File, Form, UploadFile from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
@@ -86,7 +88,7 @@ def _load_models():
"""Charge tous les modèles NER une seule fois au démarrage.""" """Charge tous les modèles NER une seule fois au démarrage."""
global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg
_cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml") _cfg = load_dictionaries(RUNTIME_DICTIONARIES_CONFIG_PATH)
# EDS-Pseudo (F1=0.97) # EDS-Pseudo (F1=0.97)
if EdsPseudoManager is not None: if EdsPseudoManager is not None:
@@ -288,7 +290,7 @@ async def anonymize_pdf(
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=vector_redaction, make_vector_redaction=vector_redaction,
also_make_raster_burn=raster_redaction, also_make_raster_burn=raster_redaction,
config_path=Path(__file__).parent / "config" / "dictionnaires.yml", config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=use_ner and ner_mgr is not None, use_hf=use_ner and ner_mgr is not None,
ner_manager=ner_mgr, ner_manager=ner_mgr,
gliner_manager=_gliner_manager if use_ner else None, gliner_manager=_gliner_manager if use_ner else None,

View File

@@ -3,6 +3,7 @@
from pathlib import Path from pathlib import Path
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Tester avec un seul PDF # Tester avec un seul PDF
test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf") test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf")
@@ -16,7 +17,7 @@ if test_pdf:
Path("/tmp/test_gui"), Path("/tmp/test_gui"),
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=True, also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
) )
print(f"✅ Succès: {result}") print(f"✅ Succès: {result}")

View File

@@ -6,6 +6,7 @@ from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Test avec un PDF simple # Test avec un PDF simple
test_pdf = Path("/tmp/test_gui_pdfs") test_pdf = Path("/tmp/test_gui_pdfs")
@@ -31,7 +32,7 @@ try:
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=True, also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
ner_manager=None, ner_manager=None,
ner_thresholds=None, ner_thresholds=None,

12
tests/conftest.py Normal file
View File

@@ -0,0 +1,12 @@
#!/usr/bin/env python3
"""
Configuration pytest partagée pour les imports du dépôt.
"""
import sys
from pathlib import Path
# Two levels up from this file: the directory that contains the tests/ folder.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put that directory first on sys.path (once) so tests can import top-level modules.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

View File

@@ -0,0 +1,26 @@
# Tests synthétiques de non-régression
Cette suite fournit 10 cas synthétiques courts, relisibles et diffables, pensés
comme première barrière de sécurité avant la revue humaine.
Principe :
- `test.txt` contient le document synthétique d'entrée à relire ou diff-er.
- `expected.txt` contient la sortie anonymisée attendue, normalisée.
- `expected.audit.json` contient un résumé stable de l'audit attendu.
- `config_overlay.yml` est optionnel et permet de tester une surcharge locale.
Objectif :
- bloquer les régressions évidentes sur les règles critiques ;
- rendre les écarts lisibles dans un diff Git ou dans la sortie de `pytest` ;
- compléter, et non remplacer, la validation humaine sur corpus réel.
Portée de cette première version :
- texte uniquement ;
- pas encore de PDF/OCR/layout ;
- pas encore de cas `xfail` pour les bugs connus.
Exécution :
```bash
pytest -q tests/unit/test_synthetic_regression.py
```

View File

@@ -0,0 +1,22 @@
[
{
"kind": "DATE_NAISSANCE",
"original": "Né le 12/03/1980",
"replacement": "[DATE_NAISSANCE]"
},
{
"kind": "NOM_GLOBAL",
"original": "ETCHEVERRY",
"replacement": "[NOM]"
},
{
"kind": "NOM_GLOBAL",
"original": "CLAUDE",
"replacement": "[NOM]"
},
{
"kind": "NOM_GLOBAL",
"original": "JEAN",
"replacement": "[NOM]"
}
]

View File

@@ -0,0 +1,3 @@
[NOM] [NOM] [NOM]
[DATE_NAISSANCE]
Consultation du 14/03/2024

View File

@@ -0,0 +1,3 @@
ETCHEVERRY JEAN CLAUDE
Né le 12/03/1980
Consultation du 14/03/2024

View File

@@ -0,0 +1,3 @@
ETCHEVERRY JEAN CLAUDE
Né le 12/03/1980
Consultation du 14/03/2024

View File

@@ -0,0 +1,12 @@
[
{
"kind": "EMAIL",
"original": "jean.dupont@example.com",
"replacement": "[EMAIL]"
},
{
"kind": "TEL",
"original": "01 23 45 67 89",
"replacement": "[TEL]"
}
]

View File

@@ -0,0 +1 @@
Contact : [EMAIL] ou [TEL]

View File

@@ -0,0 +1 @@
Contact: jean.dupont@example.com ou 01 23 45 67 89

View File

@@ -0,0 +1 @@
Contact: jean.dupont@example.com ou 01 23 45 67 89

View File

@@ -0,0 +1,7 @@
[
{
"kind": "NDA",
"original": "1234567",
"replacement": "[NDA]"
}
]

View File

@@ -0,0 +1,3 @@
N° venue :
[NDA]
Date de séjour : 14/03/2024

View File

@@ -0,0 +1,3 @@
N° venue :
1234567
Date de séjour : 14/03/2024

View File

@@ -0,0 +1,3 @@
N° venue :
1234567
Date de séjour : 14/03/2024

View File

@@ -0,0 +1,27 @@
[
{
"kind": "RPPS",
"original": "12345678901",
"replacement": "[RPPS]"
},
{
"kind": "FINESS",
"original": "123456789",
"replacement": "[FINESS]"
},
{
"kind": "IPP",
"original": "ABC12345",
"replacement": "[IPP]"
},
{
"kind": "OGC",
"original": "12",
"replacement": "[OGC]"
},
{
"kind": "IBAN",
"original": "FR76 3000 6000 0112 3456 7890 189",
"replacement": "[IBAN]"
}
]

View File

@@ -0,0 +1,5 @@
RPPS : [RPPS]
FINESS : [FINESS]
IPP : [IPP]
N° OGC : [OGC]
IBAN : [IBAN]

View File

@@ -0,0 +1,5 @@
RPPS : 12345678901
FINESS : 123456789
IPP : ABC12345
N° OGC : 12
IBAN : FR76 3000 6000 0112 3456 7890 189

View File

@@ -0,0 +1,5 @@
RPPS : 12345678901
FINESS : 123456789
IPP : ABC12345
N° OGC : 12
IBAN : FR76 3000 6000 0112 3456 7890 189

View File

@@ -0,0 +1,7 @@
[
{
"kind": "force_term",
"original": "CHCB",
"replacement": "[MASK]"
}
]

View File

@@ -0,0 +1 @@
Patient adressé au [MASK] pour avis. Retour au [MASK] demain.

View File

@@ -0,0 +1 @@
Patient adressé au CHCB pour avis. Retour au CHCB demain.

View File

@@ -0,0 +1 @@
Patient adressé au CHCB pour avis. Retour au CHCB demain.

View File

@@ -0,0 +1 @@
La classification internationale reste visible. La prise en charge est correcte.

View File

@@ -0,0 +1 @@
La classification internationale reste visible. La prise en charge est correcte.

View File

@@ -0,0 +1 @@
La classification internationale reste visible. La prise en charge est correcte.

View File

@@ -0,0 +1,3 @@
blacklist:
force_mask_terms:
- LOCAL_SIGLE

View File

@@ -0,0 +1,7 @@
[
{
"kind": "force_term",
"original": "LOCAL_SIGLE",
"replacement": "[MASK]"
}
]

View File

@@ -0,0 +1 @@
Réorientation vers [MASK] en urgence.

View File

@@ -0,0 +1 @@
Réorientation vers LOCAL_SIGLE en urgence.

View File

@@ -0,0 +1 @@
Réorientation vers LOCAL_SIGLE en urgence.

View File

@@ -0,0 +1,7 @@
[
{
"kind": "VILLE",
"original": "Bayonne",
"replacement": "[VILLE]"
}
]

View File

@@ -0,0 +1,2 @@
[VILLE], le 12/03/2024
Compte rendu adressé au patient.

View File

@@ -0,0 +1,2 @@
Bayonne, le 12/03/2024
Compte rendu adressé au patient.

View File

@@ -0,0 +1,2 @@
Bayonne, le 12/03/2024
Compte rendu adressé au patient.

View File

@@ -0,0 +1,17 @@
[
{
"kind": "NOM_GLOBAL",
"original": "ETCHEVERRY",
"replacement": "[NOM]"
},
{
"kind": "NOM_GLOBAL",
"original": "CLAUDE",
"replacement": "[NOM]"
},
{
"kind": "NOM_GLOBAL",
"original": "JEAN",
"replacement": "[NOM]"
}
]

View File

@@ -0,0 +1,2 @@
[NOM] [NOM] [NOM]
Le patient [NOM] revient ce jour.

View File

@@ -0,0 +1,2 @@
ETCHEVERRY JEAN CLAUDE
Le patient ETCHEVERRY revient ce jour.

View File

@@ -0,0 +1,2 @@
ETCHEVERRY JEAN CLAUDE
Le patient ETCHEVERRY revient ce jour.

View File

@@ -0,0 +1,7 @@
[
{
"kind": "ETAB_SPACED",
"original": "C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E",
"replacement": "[ETABLISSEMENT]"
}
]

View File

@@ -0,0 +1,2 @@
[ETABLISSEMENT]
Service de cardiologie

View File

@@ -0,0 +1,2 @@
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
Service de cardiologie

View File

@@ -0,0 +1,2 @@
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
Service de cardiologie

View File

@@ -0,0 +1,110 @@
{
"001_patient_header_and_birth": {
"description": "En-tête patient en majuscules avec date de naissance masquée et date de soin conservée.",
"must_contain": [
"[DATE_NAISSANCE]",
"Consultation du 14/03/2024"
],
"must_not_contain": [
"ETCHEVERRY",
"JEAN",
"CLAUDE",
"12/03/1980"
]
},
"002_contact_bundle": {
"description": "Email et téléphone dans une même ligne de contact.",
"must_contain": [
"[EMAIL]",
"[TEL]"
],
"must_not_contain": [
"jean.dupont@example.com",
"01 23 45 67 89"
]
},
"003_multiline_venue_number": {
"description": "Numéro de venue éclaté sur deux lignes.",
"must_contain": [
"N° venue :",
"[NDA]",
"Date de séjour : 14/03/2024"
],
"must_not_contain": [
"1234567"
]
},
"004_identifier_bundle": {
"description": "Bloc d'identifiants structurés variés.",
"must_contain": [
"[RPPS]",
"[FINESS]",
"[IPP]",
"[OGC]",
"[IBAN]"
],
"must_not_contain": [
"12345678901",
"123456789",
"ABC12345",
"FR76 3000 6000 0112 3456 7890 189"
]
},
"005_force_mask_default_term": {
"description": "Terme forcé par la configuration par défaut.",
"must_contain": [
"[MASK]"
],
"must_not_contain": [
"CHCB"
]
},
"006_whitelist_phrases_preserved": {
"description": "Expressions métier explicitement préservées.",
"must_contain": [
"classification internationale",
"prise en charge"
],
"must_not_contain": []
},
"007_overlay_force_mask_local": {
"description": "Terme local masqué via surcharge runtime.",
"must_contain": [
"[MASK]"
],
"must_not_contain": [
"LOCAL_SIGLE"
]
},
"008_ville_header": {
"description": "Ville en en-tête de courrier, date conservée.",
"must_contain": [
"[VILLE], le 12/03/2024"
],
"must_not_contain": [
"Bayonne"
]
},
"009_header_and_repeated_name": {
"description": "Propagation globale d'un nom vu dans l'en-tête.",
"must_contain": [
"Le patient [NOM] revient ce jour."
],
"must_not_contain": [
"ETCHEVERRY",
"JEAN",
"CLAUDE"
]
},
"010_spaced_establishment_header": {
"description": "En-tête d'établissement avec lettres espacées.",
"must_contain": [
"[ETABLISSEMENT]",
"Service de cardiologie"
],
"must_not_contain": [
"C E N T R E",
"H O S P I T A L I E R"
]
}
}

View File

@@ -0,0 +1,25 @@
# Jeux de tests synthétiques
Ces fichiers sont les cas de test relisibles à la main. Chaque dossier contient :
- `test.txt` : document synthétique d'entrée
- `expected.txt` : sortie anonymisée attendue
- `expected.audit.json` : résumé d'audit attendu
Cas disponibles :
- `001_patient_header_and_birth`
- `002_contact_bundle`
- `003_multiline_venue_number`
- `004_identifier_bundle`
- `005_force_mask_default_term`
- `006_whitelist_phrases_preserved`
- `007_overlay_force_mask_local`
- `008_ville_header`
- `009_header_and_repeated_name`
- `010_spaced_establishment_header`
Exemples de fichiers à ouvrir :
- 001 test : `tests/synthetic_regression/cases/001_patient_header_and_birth/test.txt`
- 001 attendu : `tests/synthetic_regression/cases/001_patient_header_and_birth/expected.txt`
- 004 test : `tests/synthetic_regression/cases/004_identifier_bundle/test.txt`
- 004 attendu : `tests/synthetic_regression/cases/004_identifier_bundle/expected.txt`
- 007 surcharge locale : `tests/synthetic_regression/cases/007_overlay_force_mask_local/config_overlay.yml`

View File

@@ -0,0 +1,26 @@
# Corpus synthétique de revue humaine
Ce corpus ne remplace pas les tests unitaires. Il sert à valider des documents
complets, relus par un humain, avec un vrai diff entre :
- `test.txt` : document synthétique source
- `expected.txt` : anonymisation attendue selon la règle métier
- `actual/` : sortie réellement produite par le moteur
Objectif :
- détecter les régressions de composition sur des documents réalistes ;
- rendre visibles les écarts de comportement du moteur ;
- préparer une validation humaine avant promotion éventuelle en suite bloquante.
Commande :
```bash
python3 tools/run_synthetic_review_corpus.py
```
Chaque exécution écrit :
- `actual.txt`
- `actual.audit.json`
- `actual.summary.json`
- `diff.txt`
Sous `tests/synthetic_review/actual/` (chemin relatif à la racine du dépôt).

View File

@@ -0,0 +1,31 @@
{
"required_kinds": [
"ADRESSE",
"CODE_POSTAL",
"DATE_NAISSANCE",
"EMAIL",
"ETAB",
"IPP",
"NDA",
"NOM_FORCE",
"TEL",
"VILLE",
"force_term"
],
"must_contain": [
"classification internationale",
"prise en charge",
"Service de cardiologie"
],
"must_not_contain": [
"ETCHEVERRY",
"JEAN",
"CLAUDE",
"12/03/1980",
"06 12 34 56 78",
"jean.claude.etcheverry@example.com",
"ABC12345",
"1234567",
"CHCB"
]
}

View File

@@ -0,0 +1,19 @@
[ETABLISSEMENT]
[VILLE], le 14/03/2024
COMPTE RENDU D'HOSPITALISATION
Patient : [NOM] [NOM] [NOM]
[DATE_NAISSANCE]
Adresse : [ADRESSE]
Code postal : [CODE_POSTAL]
Ville de résidence : [VILLE]
Téléphone : [TEL]
Mail : [EMAIL]
IPP : [IPP]
N° venue :
[NDA]
Le patient [NOM] [NOM] [NOM] est adressé au [MASK] pour bilan.
La classification internationale et la prise en charge sont discutées.
Service de cardiologie.

View File

@@ -0,0 +1,10 @@
# Revue 001
Points critiques :
- le patient doit être masqué partout, y compris en reprise narrative ;
- la date de naissance doit être masquée, pas la date de soin ;
- l'adresse, le code postal, la ville, le téléphone, le mail, l'IPP et le numéro de venue doivent disparaître ;
- `classification internationale`, `prise en charge` et `Service de cardiologie` doivent rester lisibles.
Écart attendu aujourd'hui :
- ce cas doit mettre en évidence si le moteur perd des labels structurés comme `Code postal :` ou `N° venue :`.

View File

@@ -0,0 +1,19 @@
CENTRE HOSPITALIER DE LA COTE BASQUE
Bayonne, le 14/03/2024
COMPTE RENDU D'HOSPITALISATION
Patient : ETCHEVERRY JEAN CLAUDE
Né le 12/03/1980
Adresse : 14 rue des Lilas
Code postal : 64100
Ville de résidence : Bayonne
Téléphone : 06 12 34 56 78
Mail : jean.claude.etcheverry@example.com
IPP : ABC12345
N° venue :
1234567
Le patient ETCHEVERRY JEAN CLAUDE est adressé au CHCB pour bilan.
La classification internationale et la prise en charge sont discutées.
Service de cardiologie.

View File

@@ -0,0 +1,26 @@
{
"required_kinds": [
"DATE_NAISSANCE",
"DOSSIER",
"ETAB_SPACED",
"FINESS",
"IBAN",
"NOM_FORCE",
"OGC",
"RPPS"
],
"must_contain": [
"Service de radiologie",
"classification internationale"
],
"must_not_contain": [
"DUPONT",
"MARIE",
"PAULE",
"01/02/1975",
"23L35781",
"12345678901",
"123456789",
"FR76 3000 6000 0112 3456 7890 189"
]
}

View File

@@ -0,0 +1,13 @@
[ETABLISSEMENT]
Service de radiologie
Compte rendu d'imagerie
Patient : [NOM] [NOM] [NOM]
[DATE_NAISSANCE]
N° examen : [DOSSIER]
RPPS : [RPPS]
FINESS : [FINESS]
N° OGC : [OGC]
IBAN : [IBAN]
Le dossier de [NOM] [NOM] [NOM] est revu ce jour.
La classification internationale est conservée.

View File

@@ -0,0 +1,7 @@
# Revue 002
Points critiques :
- l'en-tête d'établissement espacé doit être réduit à un placeholder ;
- le numéro d'examen, le RPPS, le FINESS, l'OGC et l'IBAN doivent disparaître ;
- le nom du patient doit être masqué dans le champ structuré et dans la phrase narrative ;
- `Service de radiologie` et `classification internationale` doivent rester visibles.

View File

@@ -0,0 +1,13 @@
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
Service de radiologie
Compte rendu d'imagerie
Patient : DUPONT MARIE PAULE
Née le 01/02/1975
N° examen : 23L35781
RPPS : 12345678901
FINESS : 123456789
N° OGC : 12
IBAN : FR76 3000 6000 0112 3456 7890 189
Le dossier de DUPONT MARIE PAULE est revu ce jour.
La classification internationale est conservée.

View File

@@ -0,0 +1,29 @@
{
"required_kinds": [
"DATE_NAISSANCE",
"EMAIL",
"ETAB",
"IPP",
"NOM_FORCE",
"RPPS",
"TEL",
"VILLE",
"force_term"
],
"must_contain": [
"prise en charge en hôpital de jour"
],
"must_not_contain": [
"LAFITTE",
"ANNE",
"MARIE",
"18/07/1968",
"Bordeaux",
"Anglet",
"anne.lafitte@example.com",
"01 23 45 67 89",
"10987654321",
"ZXC98765",
"CHCB"
]
}

View File

@@ -0,0 +1,14 @@
[ETABLISSEMENT]
[VILLE], le 22/05/2024
CONSULTATION DE SUIVI
Patient : [NOM] [NOM] [NOM]
[DATE_NAISSANCE]
Lieu de naissance : [VILLE]
Ville de résidence : [VILLE]
Contact : [EMAIL] ou [TEL]
RPPS : [RPPS]
IPP : [IPP]
Le patient [NOM] [NOM] [NOM] est adressé au [MASK].
La prise en charge en hôpital de jour est maintenue.

View File

@@ -0,0 +1,7 @@
# Revue 003
Points critiques :
- la ville d'en-tête, le lieu de naissance et la ville de résidence doivent être masqués ;
- le contact mail/téléphone, le RPPS et l'IPP doivent être masqués ;
- la reprise narrative du nom du patient doit être masquée ;
- `prise en charge en hôpital de jour` doit rester visible.

View File

@@ -0,0 +1,14 @@
CLINIQUE ATLANTIQUE
Biarritz, le 22/05/2024
CONSULTATION DE SUIVI
Patient : LAFITTE ANNE MARIE
Née le 18/07/1968
Lieu de naissance : Bordeaux
Ville de résidence : Anglet
Contact : anne.lafitte@example.com ou 01 23 45 67 89
RPPS : 10987654321
IPP : ZXC98765
Le patient LAFITTE ANNE MARIE est adressé au CHCB.
La prise en charge en hôpital de jour est maintenue.

View File

@@ -0,0 +1,27 @@
{
"required_kinds": [
"EMAIL",
"FINESS",
"IPP",
"NOM_GLOBAL",
"OGC",
"RPPS",
"TEL",
"VILLE",
"force_term"
],
"must_not_contain": [
"ETCHEVERRY",
"JEAN",
"CLAUDE",
"ABC12345",
"123456789",
"12345678901",
"Bayonne",
"Bordeaux",
"Anglet",
"06 11 22 33 44",
"jean.dupont@example.com",
"CHCB"
]
}

View File

@@ -0,0 +1,11 @@
[NOM] [NOM] [NOM]
IPP : [IPP]
FINESS : [FINESS]
RPPS : [RPPS]
[VILLE], le 12/03/2024
Lieu de naissance : [VILLE]
Ville de résidence : [VILLE]
Téléphone : [TEL]
Mail : [EMAIL]
N° OGC : [OGC]
Patient adressé au [MASK] pour avis. Retour au [MASK] demain.

View File

@@ -0,0 +1,7 @@
# Revue 004
Points critiques :
- les identifiants structurés doivent être masqués même quand le label et la valeur sont séparés ;
- la ville d'en-tête et les villes structurées doivent disparaître ;
- le nom de patient en en-tête doit être propagé ;
- les deux occurrences de `CHCB` doivent être masquées.

View File

@@ -0,0 +1,12 @@
ETCHEVERRY JEAN CLAUDE
IPP
ABC12345
FINESS : 123456789
RPPS : 12345678901
Bayonne, le 12/03/2024
Lieu de naissance : Bordeaux
Ville de résidence : Anglet
Téléphone : 06 11 22 33 44
Mail : jean.dupont@example.com
N° OGC : 12
Patient adressé au CHCB pour avis. Retour au CHCB demain.

View File

@@ -0,0 +1,15 @@
# Index du corpus de revue
Cas complets disponibles :
- [001 source](cases/001_crh_hospitalisation_complete/test.txt)
- [001 attendu](cases/001_crh_hospitalisation_complete/expected.txt)
- [001 revue](cases/001_crh_hospitalisation_complete/review.md)
- [002 source](cases/002_imagerie_complete/test.txt)
- [002 attendu](cases/002_imagerie_complete/expected.txt)
- [002 revue](cases/002_imagerie_complete/review.md)
- [003 source](cases/003_consultation_complete/test.txt)
- [003 attendu](cases/003_consultation_complete/expected.txt)
- [003 revue](cases/003_consultation_complete/review.md)
- [004 source](cases/004_structured_admin_complete/test.txt)
- [004 attendu](cases/004_structured_admin_complete/expected.txt)
- [004 revue](cases/004_structured_admin_complete/review.md)

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Tests de non-régression pour la config externalisée.
"""
from pathlib import Path
import anonymizer_core_refactored_onnx as core
from config_defaults import (
deep_merge_dict,
ensure_runtime_dictionaries_config,
load_effective_dictionaries_dict,
read_default_dictionaries_text,
read_runtime_dictionaries_overlay_text,
)
def test_default_config_template_is_externalized():
    """Check the default template text and the config loaded without a path.

    The default dictionaries text must mention the ``blacklist:`` and
    ``whitelist_phrases:`` keys, and loading with ``None`` must yield a
    config whose force-mask terms include ``CHCB``.
    """
    template = read_default_dictionaries_text()
    for section_key in ("blacklist:", "whitelist_phrases:"):
        assert section_key in template

    force_terms = core.load_dictionaries(None)["blacklist"]["force_mask_terms"]
    assert "CHCB" in force_terms
def test_runtime_overlay_template_is_minimal():
    """Check the runtime overlay text references the default file and contains ``{}``."""
    overlay_text = read_runtime_dictionaries_overlay_text()
    for fragment in ("dictionnaires.default.yml", "{}"):
        assert fragment in overlay_text
def test_deep_merge_dict_preserves_nested_defaults():
    """Pin the expected result of ``deep_merge_dict`` on nested mappings.

    Expected outcome for this fixture: the list value ends up as
    ``["DIM", "GHM"]``, scalar values come from the override, and a key
    absent from the override keeps its base value.
    """
    defaults = {
        "whitelist": {"sections_titres": ["DIM"], "org_gpe_keep": False},
        "flags": {"case_insensitive": True, "regex_engine": "python"},
    }
    overlay = {
        "whitelist": {"sections_titres": ["GHM"], "org_gpe_keep": True},
        "flags": {"regex_engine": "re2"},
    }

    result = deep_merge_dict(defaults, overlay)

    whitelist = result["whitelist"]
    assert whitelist["sections_titres"] == ["DIM", "GHM"]
    assert whitelist["org_gpe_keep"] is True

    flags = result["flags"]
    assert flags["case_insensitive"] is True
    assert flags["regex_engine"] == "re2"
def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
    """Check that extra stopwords appear after loading an overlay and vanish on reload.

    After loading a config file declaring ``xyzzymed`` the word must be in
    both ``core._MEDICAL_STOP_WORDS_SET`` and ``core._MEDICAL_STOP_WORDS``;
    after loading with ``None`` it must be in neither.
    """
    marker = "xyzzymed"
    containers = ("_MEDICAL_STOP_WORDS_SET", "_MEDICAL_STOP_WORDS")

    overlay_file = tmp_path / "cfg.yml"
    overlay_file.write_text("additional_stopwords:\n - xyzzymed\n", encoding="utf-8")

    core.load_dictionaries(overlay_file)
    # Attributes are looked up after each load in case the module rebinds them.
    for attr_name in containers:
        assert marker in getattr(core, attr_name)

    core.load_dictionaries(None)
    for attr_name in containers:
        assert marker not in getattr(core, attr_name)
def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
    """Check overlay creation and that the effective config merges it with defaults.

    ``ensure_runtime_dictionaries_config`` must return the given path and
    create the file. The effective config must contain ``CHCB`` both before
    and after a local force-mask term is written to the overlay, and must
    also contain that local term afterwards.
    """
    overlay_file = tmp_path / "dictionnaires.yml"

    returned_path = ensure_runtime_dictionaries_config(overlay_file)
    assert returned_path == overlay_file
    assert overlay_file.exists()

    terms_before = load_effective_dictionaries_dict(overlay_file)["blacklist"]["force_mask_terms"]
    assert "CHCB" in terms_before

    overlay_file.write_text(
        "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n",
        encoding="utf-8",
    )

    terms_after = load_effective_dictionaries_dict(overlay_file)["blacklist"]["force_mask_terms"]
    assert "CHCB" in terms_after
    assert "LOCAL_SIGLE" in terms_after

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Tests de non-régression pour les fuites en en-tête de document.
"""
from anonymizer_core_refactored_onnx import (
RE_NUM_ACCESSION_HEADER,
RE_NUM_EXAMEN_PATIENT,
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""

    @staticmethod
    def _audit_has(audit, kind, original):
        """Return True when ``audit`` holds a hit with this kind and original value."""
        return any(hit.kind == kind and hit.original == original for hit in audit)

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-token patient line must come out as three [NOM] tags."""
        cfg = load_dictionaries(None)
        result = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], cfg)

        for name_part in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert name_part not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam pattern must capture the compact number in group 1."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <number>`` header must match and be audited as DOSSIER."""
        cfg = load_dictionaries(None)
        header_block = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        found = RE_NUM_ACCESSION_HEADER.search(header_block)
        assert found is not None
        assert found.group(1) == "23L35781"

        result = anonymise_document_regex([header_block], [[]], cfg)
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number keeps its label and is replaced by [DOSSIER]."""
        cfg = load_dictionaries(None)
        result = anonymise_document_regex(["N° examen : 23L35781"], [[]], cfg)
        rescanned = selective_rescan(result.text_out, cfg)

        assert rescanned == "N° examen : [DOSSIER]"
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code keeps its label and is replaced by [CODE_POSTAL]."""
        cfg = load_dictionaries(None)
        result = anonymise_document_regex(["Code postal : 64100"], [[]], cfg)
        rescanned = selective_rescan(result.text_out, cfg)

        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert self._audit_has(result.audit, "CODE_POSTAL", "64100")

View File

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Tests synthétiques de non-régression pour l'anonymisation.
"""
import json
from pathlib import Path
import pytest
from anonymizer_core_refactored_onnx import (
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
from evaluation.leak_scanner import LeakScanner
# Regression suite root: the "synthetic_regression" directory located one
# level above the directory containing this test file.
SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
# One sub-directory per regression case.
CASES_DIR = SUITE_DIR / "cases"
# JSON manifest indexed by case directory name (see _assertions_for).
MANIFEST_PATH = SUITE_DIR / "manifest.json"
# Single scanner instance shared by every parametrized case.
LEAK_SCANNER = LeakScanner()
def _normalize_text(text: str) -> str:
    """Return ``text`` with unified newlines and no trailing whitespace per line.

    CRLF and lone CR become LF, leading/trailing whitespace of the whole
    text is stripped, and each remaining line is right-stripped. The result
    has no trailing newline.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines = [line.rstrip() for line in unified.strip().splitlines()]
    return "\n".join(cleaned_lines)
def _load_manifest() -> dict:
    """Read ``MANIFEST_PATH`` as UTF-8 and return the parsed JSON content."""
    raw_manifest = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_manifest)
def _case_dirs() -> list[Path]:
    """Return the sub-directories of ``CASES_DIR`` in sorted order."""
    directories = [entry for entry in CASES_DIR.iterdir() if entry.is_dir()]
    directories.sort()
    return directories
def _normalize_audit(audit: list) -> list[dict]:
    """Convert audit hits into plain dicts comparable with the expected JSON.

    Each hit contributes its ``kind``, ``original`` and ``placeholder``
    attributes; the placeholder is stored under the ``replacement`` key.
    """
    normalized = []
    for hit in audit:
        entry = {
            "kind": hit.kind,
            "original": hit.original,
            "replacement": hit.placeholder,
        }
        normalized.append(entry)
    return normalized
def _load_case_cfg(case_dir: Path):
    """Load the dictionaries for a case.

    Uses the case's ``config_overlay.yml`` when that file exists, otherwise
    calls ``load_dictionaries`` with ``None``.
    """
    overlay_path = case_dir / "config_overlay.yml"
    if overlay_path.exists():
        return load_dictionaries(overlay_path)
    return load_dictionaries(None)
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry stored under ``case_name``.

    Raises ``KeyError`` when the manifest has no entry for that name.
    """
    return _load_manifest()[case_name]
def test_synthetic_regression_inventory():
    """Check the manifest exists and that cases and manifest entries both number 10."""
    expected_case_count = 10

    assert MANIFEST_PATH.exists()
    assert len(_case_dirs()) == expected_case_count
    assert len(_load_manifest()) == expected_case_count
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda case_path: case_path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one regression case and compare text, audit, manifest rules and leaks.

    The input is ``test.txt`` when present, else ``input.txt``. The
    anonymized and rescanned text must equal ``expected.txt`` (after
    normalization), the audit must equal ``expected.audit.json``, the
    manifest's ``must_contain`` / ``must_not_contain`` rules must hold, and
    the leak scanner must report nothing.
    """
    cfg = _load_case_cfg(case_dir)
    case_rules = _assertions_for(case_dir.name)

    # Prefer test.txt, fall back to input.txt.
    primary_input = case_dir / "test.txt"
    source_path = primary_input if primary_input.exists() else case_dir / "input.txt"

    source_text = source_path.read_text(encoding="utf-8")
    expected_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expected_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8"))

    outcome = anonymise_document_regex([source_text], [[]], cfg)
    actual_text = _normalize_text(selective_rescan(outcome.text_out, cfg))
    actual_audit = _normalize_audit(outcome.audit)

    assert actual_text == expected_text
    assert actual_audit == expected_audit

    for required in case_rules.get("must_contain", []):
        assert required in actual_text
    for forbidden in case_rules.get("must_not_contain", []):
        assert forbidden not in actual_text

    # The scanner only receives the kind and original value of each hit.
    scan_seed = [{key: item[key] for key in ("kind", "original")} for item in actual_audit]
    leaks = LEAK_SCANNER.scan_text(actual_text, scan_seed)
    assert not leaks

View File

@@ -2,12 +2,12 @@
"""Debug force_term mechanism.""" """Debug force_term mechanism."""
import re import re
import yaml
from pathlib import Path
# Load config from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
cfg_path = Path("config/dictionnaires.yml")
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) # Load effective config
cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
cfg = load_effective_dictionaries_dict(cfg_path)
print("=" * 80) print("=" * 80)
print("CONFIG LOADED") print("CONFIG LOADED")

View File

@@ -5,6 +5,7 @@ import sys
from pathlib import Path from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
# Test sur 3 documents du test dataset # Test sur 3 documents du test dataset
@@ -32,7 +33,7 @@ for doc in test_docs:
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
ner_manager=None, ner_manager=None,
vlm_manager=None, vlm_manager=None,
@@ -56,4 +57,3 @@ for doc in test_docs:
print(f"{pdf_path.name}: Erreur - {e}") print(f"{pdf_path.name}: Erreur - {e}")
print("\n✅ Test terminé") print("\n✅ Test terminé")

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Exécute le corpus synthétique de revue humaine et produit les diffs.
"""
from __future__ import annotations
import argparse
import difflib
import json
import shutil
import sys
from collections import Counter
from pathlib import Path
# Directory one level above the one containing this script; it is put at the
# front of sys.path (once) so the project modules imported below resolve.
ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))
from anonymizer_core_refactored_onnx import ( # noqa: E402
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
from evaluation.leak_scanner import LeakScanner # noqa: E402
# Root of the human-review corpus.
CORPUS_DIR = ROOT / "tests" / "synthetic_review"
# One sub-directory per review case (inputs and expected outputs).
CASES_DIR = CORPUS_DIR / "cases"
# Output directory: run_case writes one sub-directory per case here.
ACTUAL_DIR = CORPUS_DIR / "actual"
# Single scanner instance shared by all cases.
SCANNER = LeakScanner()
def normalize_text(text: str) -> str:
    """Return ``text`` with unified newlines, right-stripped lines and one final LF.

    CRLF and lone CR become LF, the whole text is stripped, each line is
    right-stripped, and exactly one newline is appended (so an empty input
    yields ``"\\n"``).
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines = [line.rstrip() for line in unified.strip().splitlines()]
    return "\n".join(cleaned_lines) + "\n"
def load_expectations(case_dir: Path) -> dict:
    """Return the parsed ``expectations.json`` of a case, or ``{}`` when absent."""
    expectations_file = case_dir / "expectations.json"
    if expectations_file.exists():
        return json.loads(expectations_file.read_text(encoding="utf-8"))
    return {}
def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
    """Avoid false positives on values that are too short or ambiguous.

    Each audit item's ``original`` is stringified and stripped. It is
    dropped when, with spaces removed, it has fewer than 4 characters, or
    is all digits with fewer than 6 characters. Kept items are returned as
    ``{"kind", "original"}`` dicts carrying the stripped original.
    """

    def _is_scannable(value: str) -> bool:
        # Length checks ignore inner spaces (e.g. grouped phone digits).
        compact = value.replace(" ", "")
        if len(compact) < 4:
            return False
        return not (compact.isdigit() and len(compact) < 6)

    seed = []
    for item in audit:
        original = str(item.get("original", "")).strip()
        if _is_scannable(original):
            seed.append({"kind": item["kind"], "original": original})
    return seed
def run_case(case_dir: Path) -> dict:
    """Anonymize one review case, write its artifacts and list its failures.

    Reads ``test.txt`` and ``expected.txt`` from ``case_dir`` (plus the
    optional ``config_overlay.yml`` and ``expectations.json``), runs the
    regex anonymization followed by the selective rescan, then recreates
    ``ACTUAL_DIR/<case name>`` with ``actual.txt``, ``actual.audit.json``,
    ``actual.summary.json`` and ``diff.txt``.

    Returns a dict with the case name, the list of failure labels (empty
    when the case matches its expectations) and the output directory.
    """
    overlay_path = case_dir / "config_overlay.yml"
    cfg = load_dictionaries(overlay_path if overlay_path.exists() else None)

    source_text = (case_dir / "test.txt").read_text(encoding="utf-8")
    expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expectations = load_expectations(case_dir)

    anon = anonymise_document_regex([source_text], [[]], cfg)
    actual_text = normalize_text(selective_rescan(anon.text_out, cfg))

    audit = []
    for hit in anon.audit:
        audit.append(
            {
                "kind": hit.kind,
                "original": hit.original,
                "replacement": hit.placeholder,
            }
        )

    kind_counter = Counter(entry["kind"] for entry in audit)
    kinds_present = sorted(kind_counter)
    summary = {
        "kinds_present": kinds_present,
        "kind_counts": {kind: kind_counter[kind] for kind in kinds_present},
        "audit_len": len(audit),
        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
    }

    # Start from an empty output directory so stale artifacts never survive.
    out_dir = ACTUAL_DIR / case_dir.name
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    def _write_json(file_name: str, payload) -> None:
        rendered = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
        (out_dir / file_name).write_text(rendered, encoding="utf-8")

    (out_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
    _write_json("actual.audit.json", audit)
    _write_json("actual.summary.json", summary)

    diff_text = "".join(
        difflib.unified_diff(
            expected_text.splitlines(keepends=True),
            actual_text.splitlines(keepends=True),
            fromfile=f"{case_dir.name}/expected.txt",
            tofile=f"{case_dir.name}/actual.txt",
        )
    )
    (out_dir / "diff.txt").write_text(diff_text, encoding="utf-8")

    failures = []
    if actual_text != expected_text:
        failures.append("text_diff")
    if summary["leaks"]:
        failures.append("leak_detected")

    missing_kinds = sorted(
        kind for kind in expectations.get("required_kinds", []) if kind not in kinds_present
    )
    if missing_kinds:
        failures.append(f"missing_kinds:{','.join(missing_kinds)}")

    failures.extend(
        f"missing_text:{required}"
        for required in expectations.get("must_contain", [])
        if required not in actual_text
    )
    failures.extend(
        f"forbidden_text:{forbidden}"
        for forbidden in expectations.get("must_not_contain", [])
        if forbidden in actual_text
    )

    return {
        "case": case_dir.name,
        "failures": failures,
        "output_dir": str(out_dir),
    }
def main() -> int:
    """Run every case directory under ``CASES_DIR`` and print one status per case.

    All cases are executed before anything is printed. Returns 1 only when
    ``--strict`` was given and at least one case reported failures,
    otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Retourne un code non nul si un cas diffère de l'attendu.",
    )
    options = parser.parse_args()

    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
    case_dirs = sorted(entry for entry in CASES_DIR.iterdir() if entry.is_dir())
    results = [run_case(case_dir) for case_dir in case_dirs]

    for outcome in results:
        if outcome["failures"]:
            print(f"[FAIL] {outcome['case']}: {', '.join(outcome['failures'])}")
        else:
            print(f"[OK] {outcome['case']}")
        print(f" -> {outcome['output_dir']}")

    any_failed = any(outcome["failures"] for outcome in results)
    return 1 if options.strict and any_failed else 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -8,6 +8,7 @@ sys.path.insert(0, '.')
from pathlib import Path from pathlib import Path
import re import re
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
import time import time
@@ -47,7 +48,7 @@ def test_all_cro():
output_dir, output_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml") config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
) )
# Lire le texte anonymisé # Lire le texte anonymisé

View File

@@ -8,6 +8,7 @@ import sys
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
def test_chcb_detection(): def test_chcb_detection():
"""Test CHCB detection on the 2 documents with leaks.""" """Test CHCB detection on the 2 documents with leaks."""
@@ -53,7 +54,7 @@ def test_chcb_detection():
out_dir=outdir, out_dir=outdir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
) )
@@ -102,7 +103,7 @@ def test_chcb_detection():
out_dir=outdir, out_dir=outdir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
) )

View File

@@ -9,6 +9,7 @@ sys.path.insert(0, '.')
from pathlib import Path from pathlib import Path
import re import re
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
def test_date_propagation(): def test_date_propagation():
@@ -47,7 +48,7 @@ def test_date_propagation():
output_dir, output_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml") config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
) )
# Lire le texte anonymisé # Lire le texte anonymisé

View File

@@ -9,6 +9,7 @@ import time
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Dossier de test # Dossier de test
test_dir = Path("/tmp/test_gui_pdfs") test_dir = Path("/tmp/test_gui_pdfs")
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=True, also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
ner_manager=None, ner_manager=None,
ner_thresholds=None, ner_thresholds=None,

View File

@@ -8,6 +8,7 @@ import sys
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Simuler exactement ce que fait le GUI # Simuler exactement ce que fait le GUI
test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf") test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
@@ -27,7 +28,7 @@ try:
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=True, also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
ner_manager=None, ner_manager=None,
ner_thresholds=None, ner_thresholds=None,

View File

@@ -16,6 +16,7 @@ import re
# Ajouter le répertoire racine au path # Ajouter le répertoire racine au path
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
def test_phase1_corrections(): def test_phase1_corrections():
@@ -52,7 +53,7 @@ def test_phase1_corrections():
# Anonymiser le document # Anonymiser le document
result = process_pdf( result = process_pdf(
pdf_path=pdf_path, pdf_path=pdf_path,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
ner_manager=None, ner_manager=None,
eds_pseudo_manager=None, eds_pseudo_manager=None,
vlm_manager=None, vlm_manager=None,

View File

@@ -16,6 +16,7 @@ import re
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
def validate_corpus_sample(): def validate_corpus_sample():
@@ -94,7 +95,7 @@ def validate_corpus_sample():
output_dir, output_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, # Pas de PDF pour aller plus vite also_make_raster_burn=False, # Pas de PDF pour aller plus vite
config_path=Path("config/dictionnaires.yml") config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
) )
doc_time = time.time() - doc_start doc_time = time.time() - doc_start

View File

@@ -17,6 +17,7 @@ import re
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
def validate_full_corpus(): def validate_full_corpus():
@@ -70,7 +71,7 @@ def validate_full_corpus():
output_dir, output_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=True, also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml") config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
) )
doc_time = time.time() - doc_start doc_time = time.time() - doc_start

View File

@@ -10,6 +10,7 @@ from pathlib import Path
import json import json
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf from anonymizer_core_refactored_onnx import process_pdf
# 5 documents du corpus production (OGC 008) # 5 documents du corpus production (OGC 008)
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
out_dir=out_dir, out_dir=out_dir,
make_vector_redaction=False, make_vector_redaction=False,
also_make_raster_burn=False, also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"), config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False, use_hf=False,
ner_manager=None, ner_manager=None,
vlm_manager=None, vlm_manager=None,