diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 56f4e33..23bace1 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = { "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass", # FP audit OGC 17 CRH "mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel", - "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", + "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet", # Spécialités/services récurrents comme FP NOM "cancérologie", "cancerologie", "réanimation", "reanimation", "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie", @@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = { "transmissions", "transmission", "releve", "relevé", "objectif", "objectifs", "evaluation", "évaluation", "planification", "planifié", "planifiee", + # ── FP détectés automatiquement par audit_fp_detector.py ── + # Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms + "acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin", + "bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert", + "devenir", "diffusé", "douche", "entrée", "escarre", "espace", + "explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma", + "germes", "glace", "habillage", "liste", "maquillage", "matelas", + "mettre", "obésité", "ongles", "palais", "perlant", "pertes", + "pièce", "plaie", "risque", "saint", "sang", "signe", "sonde", + "tenue", "texte", "transaminases", "transit", "transmis", "urinal", + "vernis", "vessie", "vrac", + # Lot 2 : termes médicaux (préfixes/suffixes) + "anatomo-pathologique", "anemie", "anémie", "angioscanner", + "cétonurie", "cetonurie", "depilation", "dépilation", + "folique", "gastroentérologue", "gastroenterologue", + "microgrammes", "nalidixique", "naso-gastrique", + "angio-irm", "neuro", "neuro-chirurgie", 
#!/usr/bin/env python3
"""Automatic false-positive detector for NOM_GLOBAL anonymization hits.

Reads the ``*.audit.jsonl`` files produced by the anonymizer and cross-checks
every NOM_GLOBAL token against:

1. the system French dictionary (/usr/share/dict/french);
2. medical morphology patterns (suffixes ``-ite``, ``-ose``, ``-ique``,
   ``-ine``, ... and prefixes ``hyper-``, ``gastro-``, ``neuro-``, ...);
3. cross-document frequency (a real surname rarely appears in only one file).

Usage:
    python3 audit_fp_detector.py /chemin/vers/anonymise/
    python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
"""

import json
import re
import sys
from collections import defaultdict
from pathlib import Path

# ── French dictionary loading ──────────────────────────────────────
DICT_PATH = Path("/usr/share/dict/french")
_french_words: set = set()


def _load_french_dict():
    """Populate the module-level ``_french_words`` set (idempotent).

    A missing or unreadable dictionary is downgraded to a warning so the
    tool still runs — the DICT_FR criterion is simply disabled.
    """
    global _french_words
    if _french_words:
        return
    try:
        with open(DICT_PATH, encoding="utf-8") as f:
            _french_words = {w.strip().lower() for w in f if len(w.strip()) >= 3}
    except OSError as exc:
        print(
            f"AVERTISSEMENT: dictionnaire {DICT_PATH} illisible ({exc}) — "
            "critère DICT_FR désactivé.",
            file=sys.stderr,
        )


# ── Common French first/last names that are also dictionary words ──
# These tokens are BOTH in the dictionary AND real first/last names.
# They must NEVER be flagged as false positives.
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also ordinary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    "christine", "caroline", "elisabeth", "élisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Feminine first names in -ine (they match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names in -oine/-iste/-ence etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common surnames that are also ordinary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Surnames frequent in the corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common compound first names (match the medical suffixes -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}

# ── Medical morphology patterns ────────────────────────────────────
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)

_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)

# ── Structural DPI / care-tracking vocabulary ──────────────────────
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions",
    "preparation", "préparation", "planification",
    "validation", "notification", "recommandation",
    "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection",
    "compensation", "stabilisation", "régularisation",
    "hospitalisation", "réhospitalisation",
    "amélioration", "amelioration", "dégradation", "degradation",
    "aggravation", "complication", "résolution", "resolution",
    "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}


def _score_token(token, n_files):
    """Score one NOM_GLOBAL token; return ``(confidence, reasons)``.

    The reason codes and confidence increments define the tool's output
    contract — do not change them without updating downstream consumers.
    """
    reasons = []
    confidence = 0.0
    low = token.lower()

    # 1. Present in the French dictionary?
    in_dict = low in _french_words
    if in_dict:
        reasons.append("DICT_FR")
        confidence += 0.4

    # 2. Medical suffix?  Short -ine matches too many first names, and the
    #    very generic suffixes are ignored on short tokens.
    suffix_match = _MEDICAL_SUFFIXES.search(low)
    if suffix_match:
        suffix = suffix_match.group()
        if suffix == "ine" and len(low) < 8:
            suffix_match = None
        elif suffix in ("tion", "ment", "ence", "ance", "aire") and len(low) < 6:
            suffix_match = None
    if suffix_match:
        reasons.append("SUFFIXE_MED")
        confidence += 0.3

    # 3. Medical prefix?
    if _MEDICAL_PREFIXES.search(low):
        reasons.append("PREFIXE_MED")
        confidence += 0.3

    # 4. Structural DPI word?
    if low in _STRUCTURAL_WORDS:
        reasons.append("STRUCT_DPI")
        confidence += 0.5

    # 5. All lowercase (true names are Capitalized or ALL CAPS)
    if token.islower() and len(token) > 3:
        reasons.append("MINUSCULE")
        confidence += 0.2

    # 6. Very short (<=3) ALL CAPS → usually an abbreviation
    if len(token) <= 3 and token.isupper():
        reasons.append("ABREV_3CH")
        confidence += 0.2

    # 7. Seen in a single file AND in the dictionary → very suspicious
    if n_files == 1 and in_dict:
        reasons.append("1_SEUL_DOC")
        confidence += 0.2

    # 8. Hyphenated compound made only of dictionary words
    if "-" in token:
        parts = token.split("-")
        dict_parts = [
            p for p in parts
            if p.lower() in _french_words and p.lower() not in _KNOWN_NAME_HOMOPHONES
        ]
        if dict_parts and len(dict_parts) == len(parts):
            reasons.append("COMPOSE_DICT")
            confidence += 0.3

    return confidence, reasons


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyse every ``*.audit.jsonl`` file found in *audit_dir*.

    Returns a dict with:
      - ``fp_candidates``: list of candidate dicts (token, confidence,
        reasons, occurrences, n_files, files, already_stopped);
      - ``stats``: summary statistics.
    """
    _load_french_dict()

    root = Path(audit_dir)
    audit_files = sorted(root.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {root}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL occurrences per source file ──────────────
    token_files = defaultdict(set)   # token → set of source-file stems
    token_counts = defaultdict(int)  # token → total occurrences
    all_kinds = defaultdict(int)     # kind → count (kept for debugging)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        with open(af, encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    h = json.loads(raw)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best effort: skip malformed lines only; anything else
                    # (bugs) still surfaces instead of being swallowed.
                    continue

    # ── Score each distinct token ───────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files):
        # Dictionary words that ARE real names are never candidates.
        if token.lower() in _KNOWN_NAME_HOMOPHONES:
            continue
        n_files = len(token_files[token])
        confidence, reasons = _score_token(token, n_files)
        # Threshold: at least one reason and a minimal confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": token_counts[token],
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Highest confidence first, then alphabetical.
    fp_candidates.sort(key=lambda c: (-c["confidence"], c["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,
    }

    # Cross-check against the stop words already shipped in the core module.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
    except ImportError:
        # Core module unavailable: mark the check as "unknown".
        for c in fp_candidates:
            c["already_stopped"] = None
    else:
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"])

    return {"fp_candidates": fp_candidates, "stats": stats}


def print_report(result: dict):
    """Print a human-readable report (French UI strings) to stdout."""
    stats = result["stats"]
    candidates = result["fp_candidates"]

    print("=" * 70)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print("=" * 70)
    print(f" Fichiers audit analysés : {stats['audit_files']}")
    print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {stats['fp_candidates']}")
    print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
    print()

    # Split new candidates from those already handled by the stop words.
    new_fp = [c for c in candidates if not c.get("already_stopped")]
    old_fp = [c for c in candidates if c.get("already_stopped")]

    if new_fp:
        print(f"{'─'*70}")
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(f"{'─'*70}")
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            reasons = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")

        # Emit ready-to-paste Python for _MEDICAL_STOP_WORDS_SET.
        print(f"\n{'─'*70}")
        print(" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(f"{'─'*70}")
        tokens_to_add = sorted({c["token"].lower() for c in new_fp})
        line = " "
        for t in tokens_to_add:
            entry = f'"{t}", '
            if len(line) + len(entry) > 95:  # soft wrap at ~95 columns
                print(line.rstrip(", "))
                line = " "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{'─'*70}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(f"{'─'*70}")
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()


def auto_fix(result: dict, core_path=None):
    """Return the high-confidence (>= 0.5) new FP tokens, lowercased/sorted.

    *core_path* is accepted for interface compatibility but is currently
    unused: the function only reports, it does not rewrite the core module.
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Mandatory <audit_dir> argument restored in the usage line.
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)

    audit_dir = sys.argv[1]
    do_auto_fix = "--auto-fix" in sys.argv

    result = analyze_audit_dir(audit_dir)
    print_report(result)

    if do_auto_fix:
        tokens = auto_fix(result)
        if tokens:
            print(f"\nTokens à insérer : {tokens}")