Externalize dictionaries and add anonymization review corpus

This commit is contained in:
2026-04-21 10:32:57 +02:00
parent 012445755a
commit 500ebc28c2
99 changed files with 1805 additions and 805 deletions

View File

@@ -44,6 +44,12 @@ try:
except Exception:
yaml = None
from config_defaults import (
RUNTIME_DICTIONARIES_CONFIG_PATH,
load_effective_dictionaries_dict,
load_default_dictionaries_dict,
)
try:
from doctr.models import ocr_predictor as _doctr_ocr_predictor
_DOCTR_AVAILABLE = True
@@ -115,6 +121,29 @@ def _load_bdpm_medication_names() -> set:
return set()
def _load_wordlist_file(
    path: Path,
    *,
    transform=lambda s: s,
    label: str,
    min_len: int = 1,
) -> set:
    """Load a plain-text word list (one entry per line) into a set.

    Each line is stripped of surrounding whitespace. Blank lines, lines
    starting with ``#`` and entries shorter than ``min_len`` characters
    (measured after stripping, before ``transform``) are ignored.

    Args:
        path: Text file to read. Decoded as UTF-8; a leading byte-order
            mark is tolerated and discarded.
        transform: Callable applied to every retained entry before it is
            stored (for example ``str.lower``). Defaults to identity.
        label: Human-readable name used as prefix of the log messages.
        min_len: Minimum length of a stripped entry for it to be kept.

    Returns:
        The set of transformed entries. It is empty when the file does not
        exist (a warning is logged). When reading or transforming fails,
        the error is logged and whatever was collected so far is returned.
    """
    result: set = set()
    if not path.exists():
        log.warning("%s introuvable : %s", label, path)
        return result
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # BOM, which would otherwise stick to the first entry (or hide a
        # leading "#" comment) in files saved by some Windows editors.
        for line in path.read_text(encoding="utf-8-sig").splitlines():
            word = line.strip()
            if word and not word.startswith("#") and len(word) >= min_len:
                result.add(transform(word))
    except Exception as exc:
        # Best-effort loader: never raise, the callers fall back to
        # built-in defaults when the returned set is empty.
        log.error("%s : erreur de lecture %s : %s", label, path, exc)
    else:
        log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name)
    return result
# ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) -----------------
# Prénoms et noms de famille sont utilisés sous deux formes :
# - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS"
@@ -199,62 +228,24 @@ _FINESS_ADDR_AC = None # Automate Aho-Corasick pour adresses (noms d
_VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS)
# Communes trop ambiguës (homonymes de mots courants, trop courts, etc.)
_VILLE_BLACKLIST = {
# Directions / mots géographiques génériques
"SAINT", "NORD", "SUD", "EST", "OUEST",
"CENTRE", "SERVICE", "BOURG",
# Communes homonymes de mots courants français
"ORANGE", "TOURS", "NICE", "SENS", "VITRE",
"ROMANS", "MENTON", "SALON", "VIENNE",
"BREST", # trop court et ambigu
"HYERES", # proche de termes médicaux
"AGEN", "AUCH", "ALBI",
"BLOIS", "LAON", "LENS",
"GIEN", "GRAY",
"AIRE", "LURE", "SETE", "DOLE",
"VIRE", "LUNEL", "MURET", "MORET",
"COEUR", "FOIX", "GIVET",
"EVIAN", "MAURE", "MENDE",
"JOUE", "MEAUX", "REDON",
"CREIL", "CERGY",
# Communes de 4-5 lettres homonymes de mots très courants
"VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
"BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
"VAUX", "VERT", "FAUX", "REZE",
"BILLE", "PLACE", "VILLE", "COURS", "GRAND",
"ROUGE", "RICHE", "NUITS", "SORE", "SARE",
"TRANS", "RANS", "MARSA",
# Mots courants français (6+ lettres) aussi communes
"CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
"MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
"SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
"PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
# Prénoms très courants (aussi communes)
"MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
# Expressions composées ambiguës (aussi communes INSEE)
"LONG", "RECY", "PLAN", "MARCHE", "SALLE",
"CONTRE", "MERE", "ONDRES", "VEBRE",
# Mots structurels / médicaux
"PARIS", # omniprésent, source de faux positifs
"FRANCE", "EUROPE",
# Termes ambigus (aussi communes INSEE) - trackare/DPI
"COURANT", # "Médecin courant" ≠ ville
# Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)")
"COU", "DOS", "SEIN", "BRAS",
_VILLE_BLACKLIST_FALLBACK = {
"PARIS",
"FRANCE",
"EUROPE",
"COURANT",
"COU",
"DOS",
"SEIN",
"BRAS",
}
# Enrichissement depuis fichier externe (modifiable sans toucher au code)
_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt"
if _villes_bl_file.exists():
try:
for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines():
_w = _line.strip()
if _w and not _w.startswith("#"):
_VILLE_BLACKLIST.add(_w)
log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST))
except Exception as _exc:
log.error("Villes blacklist : erreur de lecture %s%s", _villes_bl_file, _exc)
else:
log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file)
_VILLE_BLACKLIST = _load_wordlist_file(
Path(__file__).parent / "data" / "villes_blacklist.txt",
transform=str.upper,
label="Villes blacklist",
)
if not _VILLE_BLACKLIST:
_VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK)
_BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST)
try:
import ahocorasick as _ahocorasick
@@ -331,7 +322,7 @@ def load_medical_whitelists():
global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
# 1. Charger les termes médicaux structurels
config_path = Path("config/medical_terms_whitelist.yml")
config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml"
if config_path.exists() and yaml:
try:
with open(config_path, 'r', encoding='utf-8') as f:
@@ -345,48 +336,20 @@ def load_medical_whitelists():
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
_MEDICATION_WHITELIST = _load_edsnlp_drug_names()
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
# Ajouter médicaments manquants
additional_meds = {
"idacio", "salazopyrine", "infliximab", "apranax",
"ketoprofene", "prevenar", "pneumovax", "bétadine"
}
_MEDICATION_WHITELIST.update(additional_meds)
_MEDICATION_WHITELIST.update(
_load_wordlist_file(
Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt",
transform=str.lower,
label="Whitelist médicaments manuelle",
min_len=3,
)
)
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
# Charger les whitelists au démarrage du module
load_medical_whitelists()
# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": False,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
PLACEHOLDERS = {
"EMAIL": "[EMAIL]",
"TEL": "[TEL]",
@@ -445,408 +408,49 @@ def validate_nir(nir_raw: str) -> bool:
return False
return key_int == (97 - (body_int % 97))
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes
_MEDICAL_STOP_WORDS_SET = {
# Mots français courants (déterminants, prépositions, adverbes, etc.)
"pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
"mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
"les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
"cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
"puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
"mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
"semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
"franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
"date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
"signé", "réalisé", "courrier", "cabinet", "rue",
# Verbes / participes courants
"remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
"prescrit", "prescrite", "présente", "présent", "absente", "absent",
"reprise", "introduction", "arrêt", "relais",
# Titres / rôles hospitaliers
"chef", "assistant", "assistante", "praticien", "praticienne",
"docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
"spécialiste", "contractuel", "contractuelle", "titulaire",
"confrère", "consoeur", "coordonnateur", "coordonnatrice",
"médecin", "médical", "infirmier", "infirmière",
"praticiens", "patient", "patiente",
# Structure hospitalière
"service", "pôle", "clinique", "consultation", "secrétariat",
"hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
# Villes / géographie (pas des noms de personnes)
"bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
"toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
"basque", "basques", "sud", "côte",
# Médicaments génériques et spécialités (DCI + noms commerciaux)
"colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
"methotrexate", "eplerenone", "speciafoldine", "prednisone",
"corticoïdes", "cortisone",
"paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
"lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
"insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
"ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
"morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
"seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
"bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
"quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
"opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
"laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
"terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
"lansoprazole", "perindopril", "sodium", "velmetia",
"doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
"augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
"ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
"irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
"spironolactone", "furosemide", "lasilix", "aldactone",
"tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
"xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
"plavix", "clopidogrel", "ticagrelor", "brilique",
"ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
"salbutamol", "tiotropium", "budesonide", "beclometasone",
"oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
"nubain", "nalbuphine", "nefopam", "acupan", "profenid",
"ibuprofene", "diclofenac", "naproxene", "celecoxib",
"gabapentine", "pregabaline", "lyrica", "neurontin",
"amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
"paroxetine", "escitalopram", "citalopram", "mirtazapine",
"olanzapine", "risperidone", "aripiprazole", "haloperidol",
"loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
"clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
"stilnox", "zolpidem", "imovane",
"levothyroxine", "metformine", "glimepiride", "sitagliptine",
"januvia", "jardiance", "empagliflozine", "dapagliflozine",
"ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
"heparine", "enoxaparine", "tinzaparine", "innohep",
"warfarine", "coumadine", "fluindione", "previscan",
"ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
"vancomycine", "gentamicine", "tazocilline", "piperacilline",
"meropenem", "imipenem", "clindamycine", "doxycycline",
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
"polyionique", "propranolol", "apidra", "solostar",
# Noms et suffixes laboratoires pharmaceutiques
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
"evolugen", "alter", "zydus", "medisol", "substipharm",
"sdz", "bgr", "egt", "rnb",
# Formes galéniques / voies d'administration
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
"unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
"orodisp", "capsule", "patch", "suppositoire", "gouttes",
# Termes de prescription / pharmacie
"prescription", "prescriptions", "dose", "fréquence", "statut",
"technique", "capteur", "bandelettes", "glycemiques", "glycemique",
"lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
"glycemie", "capillaire", "hgt",
# Termes médicaux / cliniques
"myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
"dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
"antécédents", "examen", "bilan", "résultats", "analyse",
"interne", "externe", "médecine", "chirurgie", "rhumatologie",
"dermatologie", "immunologie", "cardiologie", "pneumologie",
"neurologie", "gynécologie", "radiologie", "sénologie",
"douleur", "douleurs", "douloureux", "musculaire", "musculaires",
"thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
"normale", "normaux", "habituelle", "habituelles",
"synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
"pelvien", "diabétique", "sommeil", "régime", "diet",
"desinfection", "environnement", "identification", "bracelet",
"toilettes", "accompagner", "installer", "transfusion",
"signes", "vitaux", "alimentaire", "avis", "zone",
"calcémie",
# Abréviations médicales
"irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
"bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
"saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
"poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
"qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
"vit", "zen",
"scanner", "radio", "écho", "échographie",
# Spécialités médicales (éviter faux positifs NOM)
"hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
"proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
"cardiologue", "néphrologue", "urologue", "gériatre",
"hépatologue", "endocrinologue", "stomatologue",
# Termes médicaux / titres fréquemment détectés comme NOM par le NER
"supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
"suivi", "sortie", "emog", "ophtalmo",
# Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
"eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
"lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
"depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
"rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
"pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
# Termes médicaux / soins / actes détectés comme NOM
"partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
"diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
"ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
"diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
# Mots courants français détectés comme NOM dans les trackare
"toilette", "repas", "poche", "installation", "education", "éducation",
"refection", "réfection", "complete", "complète", "regime", "régime",
"normal", "traité", "traite", "arrêté", "arrete", "volume",
"commentaires", "france", "covid", "framboise", "epoux", "époux",
# Abréviations médicales courtes (3-4 chars) détectées comme NOM
"ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
"mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
"amox", "endoc", "microg", "item", "pyélo", "néphro",
# En-têtes de colonnes / mots structurels trackare
"observations", "observation", "commentaires", "commentaire",
"surveillance", "température", "temperature", "glycémie", "glycemie",
"diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
"saturation", "fréquence", "frequence", "respiratoire", "douleur",
"alertes", "alerte", "antécédents", "antecedents", "habitus",
"allergies", "prescriptions", "prescription", "administration",
"catégorie", "categorie", "expiration", "message",
"destination", "diagnostique", "diagnostiques",
"date", "note", "nom", "heure", "type", "code", "etat",
"comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
# Médicaments supplémentaires détectés dans les trackare
"depakote", "versatis", "humalog", "forxiga", "durogesic",
"montelukast", "rosuvastatine",
# Abréviations pharma courtes
"cpr", "sol", "bic", "agt", "poche", "inhal",
# Termes chirurgicaux/cliniques FP
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
"gauche", "droit", "droite", "face", "profil",
# Faux positifs EDS supplémentaires
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
"10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
"actiskenan", "simvastatine", "forlax",
# Mots temporels / contextuels détectés comme EDS_HOPITAL
"semaine", "jour", "matin", "soir", "nuit", "midi",
# Mots clés de contexte document
"compétences", "maladies", "inflammatoires", "systémiques", "rares",
"fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
"haute", "maison", "aide", "rpps", "poste", "fonct",
"sante", "santé", "etxe", "ttipi", "gastro", "concha",
"endoscopie", "endoscopique", "fibroscopie",
"indication", "conclusion", "technique", "anesthésie",
"digestif", "digestive", "digestives", "nutritive",
# Abréviations soins trackare détectées comme NOM (batch 20 OGC)
"soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
# Verbes d'instructions soins (aussi des patronymes INSEE → FP)
"coucher", "manger", "marcher", "sortir",
"verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
# Mots narratifs CRH capturés par fusion sidebar 2-colonnes
"evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
"lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
"paracetamol", "paracétamol", "unité", "unite",
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
"glyc", "glycosurie", "vider", "forte",
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
"responsable", "autre", "autres", "autonome", "autonomes",
"préparations", "preparations", "prévenir", "prevenir",
"acétylsalicylique", "acetylsalicylique", "angio",
"desc", "diu", "barreau",
"haitz", "alde",
# FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
"alimentation", "augmentation", "amelioration", "amélioration",
"biliaire", "biliaires", "bili", "voies", "voie",
"apyrexie", "apyréxie", "apyrétique", "apyretique",
"clavulanique", "mecillinam", "sulfamides", "sulfamide",
"tazobactam", "temocilline", "ecoflac", "furanes", "furane",
"exilar", "lipruzet", "mopral",
"sensible", "sensibles", "dossier", "dossiers",
"entero", "entéro", "medecine", "bio",
"aviation", "contention", "isolement",
"elimination", "élimination", "infectieux",
"hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
"cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
"appendicectomie", "néoplasie", "neoplasie",
"ovarienne", "prandial", "fébrile", "febrile",
"eupnéique", "eupneique", "normocarde", "normotendue",
"variable", "dosage", "posologie",
# Abréviations diététiques/soins trackare
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
# FP audit OGC 17 CRH
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
"saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
# Spécialités/services récurrents comme FP NOM
"cancérologie", "cancerologie", "réanimation", "reanimation",
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
"ophtalmologie", "stomatologie", "allergologie",
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
"orthopédie", "orthopedie", "traumatologie",
"palliatifs", "palliative", "palliatif",
"addictologie", "alcoologie", "tabacologie",
# FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
"discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
"evendol", "relais", "repas", "poursuite", "indication",
# FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
"eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
"thermie", "alim", "alimentation", "admin",
# Médicaments/tests labo capturés par patterns soignants
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
"ciprofloxacine", "lavement", "desinfection", "désinfection",
"avaler", "rachis", "lombaire", "thoraco-lombaire",
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
"faible", "fort", "forte",
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
"entree", "entrée", "continu", "continue",
"morphine", "claforan", "skenan", "actiskenan",
# Fragments de noms de médicaments (pdfplumber split)
"sium", "pegic", "fenid", "profenid",
# Catégories cliniques Trackare (en-têtes de section masqués à tort)
"respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
"hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
"transit", "anemie", "anémie", "constantes", "examen",
"post-op", "postop", "pré-op", "preop", "chimio", "elim",
"toilette", "sommeil", "hypota", "hypotension", "spo2",
"urine", "urines", "sng",
"rénale", "renale", "rénal", "renal", "cardiaque",
# Termes structurels trackare
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
"planification", "planifié", "planifiee",
# ── FP détectés automatiquement par audit_fp_detector.py ──
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
"vernis", "vessie", "vrac",
# Lot 2 : termes médicaux (préfixes/suffixes)
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
"cétonurie", "cetonurie", "depilation", "dépilation",
"folique", "gastroentérologue", "gastroenterologue",
"microgrammes", "nalidixique", "naso-gastrique",
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
"cyto", "plaie-colle", "bionolyte",
# Lot 1 (103 tokens, confiance >= 0.5) ──
# Anatomie / clinique
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
# Pathologies / symptômes
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
# Médicaments / matériel médical
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
"oxygène", "pansement", "vitamine",
# Biologie / examens
"biochimie", "biologie", "fer",
# Actions / états cliniques
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
"intervention", "position", "rappel", "relation", "retour", "réalisation",
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
"urgent", "validation",
# Mots courants / contextuels
"angle", "bille", "boisson", "bureau", "cases", "circuit",
"concubin", "confortable", "demain", "densité", "dernière",
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
"personne", "premier", "quartier", "retraite", "route", "rés",
"trouve", "verrouillé", "villa", "étage",
# Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
"ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
"bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
"traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
"maternité", "orale", "sachet", "absence",
# FP audit 30 fichiers Phase 2 (mars 2026)
"bouffee", "bouffée", "discontinue", "respimat", "lyoc",
"probnp", "pro-bnp", "nt-probnp",
"bpco", "colle", "gsc", "masse",
"selle", "selles",
# Acronymes médicaux courts (3 lettres) souvent FP comme NOM
"epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
"imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
"hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
"vni", "aeg", "bas", "snv", "hba", "ide", "dci",
# Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026)
"buvable", "buvables", "nominal", "nominaux",
"acide", "principale", "principal", "principaux",
"hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
"clavulanique", "nalidixique",
"grancher", # Centre de réadaptation (nom d'établissement homonyme)
"experf", # Prestataire HAD (nom commercial homonyme)
# Noms de services hospitaliers (FP comme [NOM])
"ortho", "mobile", "polyvalente", "polyvalent",
"geriatrie", "gériatrie", "ambulatoire", "provisoire",
"intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
# Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents)
"viscerale", "viscérale", "vasculaire", "vasculaires",
"conventionnelle", "conventionnel",
"polyvalente", "polyvalent",
"infectieuse", "infectieuses",
# Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216)
"aide", "partielle", "partiel", "complete", "complète", "complet",
"contention", "lavabo", "blader", "scan", "post", "lunettes",
"deshabillage", "déshabillage", "habillage",
"surveillance", "surv", "refection", "réfection",
"miction", "toilette", "douche", "changes",
"installation", "transfert", "mobilisation",
"alimentation", "hydratation", "collation",
"stimulation", "prevention", "prévention",
# Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16)
"chlorure",
# Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
"canule", "canules", "masque", "sonde", "sondes",
# Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
"totale", "total", "partielle", "partiel",
"prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
# Antiseptiques / produits de soins (FP trackare prescriptions)
"betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
# Nutrition entérale / compléments
"fresubin", "nutrison", "sondalis", "isosource", "novasource",
# Termes médicaux FP dans bactério / texte libre
"nombreuses", "nombreux", "plusieurs", "quelques",
"internationale", "international",
"resorbable", "résorbable", "resorbables", "résorbables",
"alfa", "capsule", "capsules",
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes.
# Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp.
_MEDICAL_STOP_WORDS_FALLBACK = {
"date",
"note",
"heure",
"type",
"traitement",
"traitements",
"soins",
"surveillance",
"consultation",
"hospitalisation",
}
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
# Enrichissement depuis fichier externe (modifiable sans toucher au code)
_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt"
if _stopwords_file.exists():
try:
_sw_count = 0
for _line in _stopwords_file.read_text(encoding="utf-8").splitlines():
_w = _line.strip()
if _w and not _w.startswith("#"):
_MEDICAL_STOP_WORDS_SET.add(_w)
_sw_count += 1
log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name)
except Exception as _exc:
log.error("Stop-words manuels : erreur de lecture %s%s", _stopwords_file, _exc)
else:
log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file)
# Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives
_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
if _bdpm_path.exists():
try:
_bdpm_count = 0
for _line in _bdpm_path.read_text(encoding="utf-8").splitlines():
_w = _line.strip()
if _w and not _w.startswith("#"):
_MEDICAL_STOP_WORDS_SET.add(_w)
_bdpm_count += 1
log.info("BDPM stop-words chargés : %d mots", _bdpm_count)
except Exception as _exc:
log.error("BDPM stop-words : erreur de lecture %s%s", _bdpm_path, _exc)
else:
log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path)
_MEDICAL_STOP_WORDS = (
r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
_MEDICAL_STOP_WORDS_SET = _load_wordlist_file(
Path(__file__).parent / "data" / "stopwords_manuels.txt",
transform=str.lower,
label="Stop-words manuels",
)
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
_MEDICAL_STOP_WORDS_SET.update(
_load_wordlist_file(
Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt",
transform=str.lower,
label="BDPM stop-words",
)
)
if not _MEDICAL_STOP_WORDS_SET:
_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK)
_BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET)
def _refresh_medical_stopwords_pattern() -> None:
    """Rebuild the ``_MEDICAL_STOP_WORDS`` regex fragment from the current set.

    The fragment is a non-capturing alternation of every entry of
    ``_MEDICAL_STOP_WORDS_SET``, each one regex-escaped, in sorted order so
    the resulting string is deterministic. When the set is empty the
    fragment is ``(?!)``, an empty negative lookahead that can never match.
    """
    global _MEDICAL_STOP_WORDS
    if _MEDICAL_STOP_WORDS_SET:
        escaped_terms = [re.escape(term) for term in sorted(_MEDICAL_STOP_WORDS_SET)]
        alternation = "|".join(escaped_terms)
        _MEDICAL_STOP_WORDS = r"(?:" + alternation + r")"
    else:
        # An empty alternation "(?:)" would match everywhere; use a pattern
        # that never matches instead.
        _MEDICAL_STOP_WORDS = r"(?!)"
_refresh_medical_stopwords_pattern()
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile(
@@ -985,7 +589,17 @@ RE_CIVILITE_INITIALE = re.compile(
# --- N° examen / N° patient imagerie (radiologie) ---
RE_NUM_EXAMEN_PATIENT = re.compile(
r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})",
r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*"
r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)",
re.IGNORECASE,
)
# --- Bare "N°" number in a lab / imaging report header ---
# Matches an identifier that follows "N°" (or "No") at the start of the text
# or right after a line break, and that is itself followed by a line starting
# with "Prélevé le" or "Enregistré le" (accents optional, case-insensitive).
# Example:
#   N° 23L35781
#   Prélevé le 26/07/2023 Enregistré le 27/07/2023
# Group 1 captures the identifier (6 to 20 letters, digits or hyphens).
# Up to two further lines may sit between the identifier and the date line;
# the surrounding `\s*` can additionally absorb blank lines.
RE_NUM_ACCESSION_HEADER = re.compile(
r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n"
r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)",
re.IGNORECASE,
)
@@ -1177,6 +791,7 @@ _DPI_LABELS_SET: set = _load_txt_set(
)
if not _DPI_LABELS_SET:
_DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)
_BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET)
# Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
# (spécialités, labos pharma, mots courants ambigus).
@@ -1189,6 +804,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set(
)
if not _COMPANION_BLACKLIST_SET:
_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)
_BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET)
_WHITELIST_FUNCTION_WORDS = {
@@ -1223,14 +839,15 @@ def _load_whitelist_phrases(phrases) -> int:
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
cfg = DEFAULTS_CFG.copy()
if config_path and config_path.exists() and yaml is not None:
try:
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
for k, v in user.items():
cfg[k] = v
except Exception:
pass
global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
_MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
_VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
_DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
_COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET)
_WHITELIST_NEVER_MASK_TOKENS.clear()
_WHITELIST_NEVER_MASK_PHRASES.clear()
# Charger les stop-words et villes supplémentaires depuis le YAML
extra_sw = cfg.get("additional_stopwords", [])
@@ -1239,6 +856,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
if w and str(w).strip():
_MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))
_refresh_medical_stopwords_pattern()
extra_villes = cfg.get("additional_villes_blacklist", [])
if extra_villes:
@@ -1871,8 +1489,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
return key
def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str:
start = full_match.find(captured_value)
if start < 0:
return placeholder
end = start + len(captured_value)
return full_match[:start] + placeholder + full_match[end:]
def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask the structured fields whose detection depends on the line's label.

    Four patterns are applied to ``line`` in a fixed order: RE_CODE_POSTAL,
    RE_NUM_EXAMEN_PATIENT, RE_NUMERO_DOSSIER and RE_VENUE_SEJOUR. Each match
    appends a ``PiiHit`` to ``audit`` (kinds CODE_POSTAL, DOSSIER, DOSSIER and
    NDA respectively) and has its sensitive value replaced by the matching
    entry of ``PLACEHOLDERS``.

    Args:
        line: Text line to process.
        audit: List receiving one ``PiiHit`` per masked value (mutated).
        page_idx: Page index stored in every recorded hit.

    Returns:
        The line after all four substitutions.
    """

    def _log(kind: str, value: str) -> str:
        # Record the hit and hand back the placeholder used for it.
        token = PLACEHOLDERS[kind]
        audit.append(PiiHit(page_idx, kind, value, token))
        return token

    def _first_group_masker(kind: str):
        # Build a re.sub callback that masks capture group 1 as `kind`.
        def _callback(match: re.Match) -> str:
            value = match.group(1)
            return _replace_captured_value(match.group(0), value, _log(kind, value))

        return _callback

    def _mask_postal_code(match: re.Match) -> str:
        captured = match.group(1)
        token = _log("CODE_POSTAL", captured or match.group(2) or match.group(0))
        if captured:
            # Group 1 present: keep the surrounding text, hide only the value.
            return _replace_captured_value(match.group(0), captured, token)
        return token

    def _mask_dossier(match: re.Match) -> str:
        value = match.group(1) or match.group(2) or match.group(0)
        return _replace_captured_value(match.group(0), value, _log("DOSSIER", value))

    result = line
    for pattern, callback in (
        (RE_CODE_POSTAL, _mask_postal_code),
        (RE_NUM_EXAMEN_PATIENT, _first_group_masker("DOSSIER")),
        (RE_NUMERO_DOSSIER, _mask_dossier),
        (RE_VENUE_SEJOUR, _first_group_masker("NDA")),
    ):
        result = pattern.sub(callback, result)
    return result
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
line = _mask_admin_label(line, audit, page_idx)
structured_line = _mask_structured_line(line, audit, page_idx)
if structured_line != line:
return structured_line
parts = SPLITTER.split(line, maxsplit=1)
if len(parts) == 2:
key, value = parts
@@ -2413,6 +2072,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
for m in _RE_EMAIL_HEADER.finditer(full_text):
_add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")
# En-têtes patient en capitales, sans libellé explicite.
# Exemple:
# ETCHEVERRY JEAN CLAUDE
# On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom
# INSEE et un nom de famille INSEE. Les tokens proposés viennent
# exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
_UPPER_NAME_LINE_RE = re.compile(
r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$",
re.MULTILINE,
)
for m in _UPPER_NAME_LINE_RE.finditer(full_text):
raw_line = re.sub(r"\s+", " ", m.group(1)).strip()
tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")]
if len(tokens) < 2 or len(tokens) > 4:
continue
if any(len(tok) < 3 for tok in tokens):
continue
norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens]
has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens)
has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens)
if not (has_prenom and has_nom):
continue
for tok, norm_tok in zip(tokens, norm_tokens):
if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE:
_add_candidate(tok, "UPPER_NAME_LINE", "low", False)
# Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
# ajouter aussi les parties individuelles pour capturer les occurrences standalone.
# _apply_extracted_names traite le composé en premier (plus long) puis les parties.
@@ -2582,10 +2270,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
"""Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.).
"""Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
_APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
_APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
# Collecter les valeurs à remplacer, groupées par placeholder
replacements: Dict[str, str] = {} # original → placeholder
for h in audit:
@@ -2698,7 +2386,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_IPP_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
# Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie
# Ex:
# N° 23L35781
# Prélevé le 26/07/2023
for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164)
_RE_DEMANDE_MULTILINE = re.compile(
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
re.IGNORECASE,
@@ -2706,14 +2401,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
# Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
_RE_VENUE_MULTILINE = re.compile(
r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
re.IGNORECASE,
)
for m in _RE_VENUE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
# Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
# Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
_RE_VENUE_REVERSE = re.compile(
r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
re.IGNORECASE | re.MULTILINE,
@@ -3092,55 +2787,17 @@ def _build_finess_ac():
return
# Mots génériques qui ne doivent jamais être matchés seuls
_ac_generic_blacklist = {
# Types d'établissements
"clinique", "pharmacie", "hopital", "centre", "foyer",
"residence", "maison", "cabinet", "service", "laboratoire",
"institut", "association", "fondation", "mutuelle", "polyclinique",
"dispensaire", "hospice", "annexe", "antenne", "site",
# Mots français courants qui sont aussi des noms d'établissements
"collegiale", "collegial", "cathedral", "cathedrale",
"providence", "esperance", "renaissance", "liberation",
"republique", "fraternite", "solidarite", "independance",
"beauregard", "bellevue", "belvedere",
"promenade", "esplanade", "corniche", "prefecture",
"croissant", "confluence", "bienvenue",
"chartreuse", "commanderie", "chapelle", "basilique",
"departement", "departementale", "communautaire",
# Spécialités médicales / termes cliniques courants
"chirurgicale", "radiologie", "addictologie", "prevention",
"psychotherapique", "ambulatoire", "hospitalisation",
"consultation", "surveillance", "therapeutique",
"readaptation", "reeducation", "reanimation",
"specialisee", "conventionnelle", "professionnelle",
"informatique", "administrative", "regionale",
# Mots communs
"generation", "revolution", "assomption", "visitation",
"consolation", "atlantique", "manutention", "prefiguration",
"intervalle", "pharmaciens", "pharmacien", "transfert",
"comprimee", "comprimees", "injectable", "injectables",
"maintenant", "actuellement", "auparavant", "prochainement",
"rapidement", "correctement", "directement", "simplement",
"internationale", "international", "intercommunal", "intercommunale",
# Termes médicaux homonymes d'établissements FINESS (retour relecteur 2026-03-17)
"resistance", "radiotherapie", "chimiotherapie", "curietherapie",
"hormonotherapie", "immunotherapie", "kinesitherapie",
"ergotherapie", "orthophonie", "psychomotricite",
"reeducation", "readaptation", "convalescence",
"dependance", "autonomie", "gerontologie",
}
_ac_generic_blacklist = _load_wordlist_file(
data_dir / "generic_name_blacklist.txt",
transform=str.lower,
label="FINESS noms génériques blacklist",
)
# Expressions multi-mots trop génériques
_ac_generic_phrases = {
"a domicile", "au domicile", "menage a domicile",
"du nord", "du sud", "de l est", "de l ouest",
"la maison", "la residence", "les jardins",
"le village", "le parc", "la colline",
"au soleil", "en france",
# Expressions médicales homonymes d'établissements FINESS (FP relecteur 2026-03-16)
"long cours", "au long cours",
"le bourg", "le val", "le clos", "le mas",
"les pins", "les chenes", "les oliviers",
}
_ac_generic_phrases = _load_wordlist_file(
data_dir / "generic_phrase_blacklist.txt",
transform=str.lower,
label="FINESS expressions génériques blacklist",
)
# Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs
# (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars).
# Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle.
@@ -3365,8 +3022,11 @@ def _build_finess_addr_ac():
"sentier", "rond-point", "traverse", "esplanade",
"promenade", "montee", "voie", "carrefour", "faubourg"}
# Patterns non-adresse à exclure
_addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier",
"cabinet paramedical", "cabinet sage-femme"}
_addr_blacklist = _load_wordlist_file(
data_dir / "address_blacklist.txt",
transform=str.lower,
label="FINESS adresses blacklist",
)
for line in addr_path.read_text(encoding="utf-8").splitlines():
name = line.strip()
if not name or len(name) < 10:
@@ -3804,11 +3464,19 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
def _rescan_code_postal(m: re.Match) -> str:
    """Substitution callback for RE_CODE_POSTAL during the rescan.

    When capture group 1 is non-empty, only that value is replaced inside the
    matched text; otherwise the whole match becomes the CODE_POSTAL
    placeholder.
    """
    captured = m.group(1)
    if not captured:
        return PLACEHOLDERS["CODE_POSTAL"]
    return _replace_captured_value(m.group(0), captured, PLACEHOLDERS["CODE_POSTAL"])
protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected)
# N° Episode
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
# N° venue / séjour
protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
protected = RE_VENUE_SEJOUR.sub(
lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]),
protected,
)
# N° RPPS
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
@@ -4825,7 +4493,7 @@ if __name__ == "__main__":
ap.add_argument("--out", type=str, default="out")
ap.add_argument("--no-vector", action="store_true")
ap.add_argument("--raster", action="store_true")
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
args = ap.parse_args()