#!/usr/bin/env python3
"""Automatic false-positive detector for NOM_GLOBAL entities.

Parses ``*.audit.jsonl`` files and cross-checks NOM_GLOBAL tokens against:

1. The system French dictionary (/usr/share/dict/french)
2. Medical morphology patterns (-ite, -ose, -ique, -ine, etc.)
3. Cross-document frequency (a real name rarely appears in one single record)

Usage:
    python3 audit_fp_detector.py /path/to/anonymise/
    python3 audit_fp_detector.py /path/to/anonymise/ --auto-fix
"""
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path

# ── French dictionary loading ──────────────────────────────────────
DICT_PATH = Path("/usr/share/dict/french")
_french_words: set = set()


def _load_french_dict():
    """Lazily populate the module-level ``_french_words`` set (idempotent).

    Entries shorter than 3 characters are dropped (too noisy for matching).
    If the system dictionary is missing, the set stays empty and the
    DICT_FR heuristic is effectively disabled instead of crashing the tool.
    """
    global _french_words
    if _french_words:
        return  # already loaded
    try:
        # Explicit UTF-8: accented entries ("médecin", "hôpital") would be
        # corrupted or rejected under a mismatched locale default encoding.
        with open(DICT_PATH, encoding="utf-8") as f:
            _french_words = set(w.strip().lower() for w in f if len(w.strip()) >= 3)
    except OSError as e:
        # Missing/unreadable dictionary: warn and continue with an empty set.
        print(f"Avertissement: dictionnaire indisponible ({e})", file=sys.stderr)

# ── Common French first/family names (dictionary homonyms) ─────────
# These words are both in the dictionary AND real first/family names.
# They must NOT be flagged as false positives.
# Names that collide with ordinary dictionary words or medical morphology.
# A token found here is whitelisted: it is assumed to be a genuine person
# name and is never reported as a false-positive candidate.
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also dictionary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent", "simon",
    "pierre", "marie", "jean", "paul", "louis", "marc", "charles", "henry",
    "victor", "rose", "marguerite", "pascal", "leon", "léon", "auguste",
    "clement", "clément", "olive", "sylvie", "denis", "raymond", "roger",
    "maxime", "claude", "marcel", "germaine", "alice", "florence",
    "dominique", "christine", "caroline", "elisabeth", "elisabeth",
    "thomas", "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené", "yves",
    "gilles", "noel", "noël", "aimé", "aime", "guy", "joël", "joelle",
    "gilbert", "fernand", "édith", "edith", "agnès", "agnes", "jeanne",
    "lucie", "laure", "adrien", "bastien", "julien", "viviane",
    "constance", "armand", "blanche", "clémence", "clemence", "prudence",
    "patience", "grace", "grâce", "fidèle", "placide",
    # Feminine first names ending in -ine (would match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie", "marceline",
    "marine", "marjolaine", "martine", "madeleine", "melaine", "moline",
    "morgane", "nadine", "noémie", "noemie", "pauline", "perrine",
    "sabine", "sandrine", "séverine", "severine", "tiphaine", "virginie",
    # First names ending in -oine/-iste/-ence etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte", "alexandra",
    "aurore", "jules", "jacques", "mathieu", "olivier", "ana", "maria",
    "pascale", "laura", "margot", "marina", "maite", "maïté",
    # Common family names that are also dictionary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre", "page",
    "comte", "baron", "marin", "fournier", "bonhomme", "boucher",
    "berger", "marchand", "chevalier", "gros", "masson", "bonnet",
    "vidal", "meunier", "fontaine", "robin", "gay", "perrin", "roche",
    "rey", "maître", "maitre", "salle", "aubry",
    # Family names frequent in this corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier", "colas",
    "combes", "forges", "gaillard", "galidie", "gendre", "genevois",
    "grenier", "lemoine", "martel", "martial", "moulin", "pineau",
    "piper", "pontier", "poulot", "rosier", "roussel-fontaine", "saule",
    "senne", "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan", "breton",
    "rainer", "bataille", "cabaner", "klement", "boucau", "marte",
    "dores", "culot",
    # Common compound first names (would match medical suffix -ine/-ence)
    "anne-christine", "marie-christine", "marie-line", "marie-marceline",
    "berges",
}

# ── Medical morphology patterns ────────────────────────────────────
# Suffixes typical of medical vocabulary; anchored at end of token.
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)

# Prefixes typical of medical vocabulary; anchored at start of token.
# Accented and unaccented variants are both listed because source
# documents mix the two spellings.
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)

# ── Structural / tracker / DPI words
# ────────────────────────────────────────────────────────────────────
# Workflow vocabulary from the hospital information system (DPI) that
# the NER model tends to mistake for person names.
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions", "preparation",
    "préparation", "planification", "validation", "notification",
    "recommandation", "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection", "compensation",
    "stabilisation", "régularisation", "hospitalisation",
    "réhospitalisation", "amélioration", "amelioration", "dégradation",
    "degradation", "aggravation", "complication", "résolution",
    "resolution", "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyse every ``*.audit.jsonl`` file in *audit_dir*.

    Args:
        audit_dir: directory containing the audit files.

    Returns:
        dict with:
        - ``fp_candidates``: list of candidate dicts (token, confidence,
          reasons, occurrences, n_files, files, already_stopped)
        - ``stats``: summary statistics
    """
    _load_french_dict()
    base_dir = Path(audit_dir)
    audit_files = sorted(base_dir.glob("*.audit.jsonl"))
    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {base_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL hits per file ───────────────────────────
    token_files = defaultdict(set)   # token → set of source filenames
    token_counts = defaultdict(int)  # token → total occurrence count
    all_kinds = defaultdict(int)     # kind → count (corpus overview)

    for af in audit_files:
        # "x.audit.jsonl" → stem "x.audit" → "x"
        fname = af.stem.replace(".audit", "")
        with open(af, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    h = json.loads(line)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Skip malformed lines only; the previous bare
                    # ``except Exception: pass`` hid real errors too.
                    continue

    # ── Score each token ───────────────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files.keys()):
        reasons = []
        confidence = 0.0
        token_lower = token.lower()
        n_files = len(token_files[token])
        n_total = token_counts[token]

        # Whitelisted name/dictionary homonyms are never flagged.
        if token_lower in _KNOWN_NAME_HOMOPHONES:
            continue

        # 1. Present in the French dictionary?
        in_dict = token_lower in _french_words
        if in_dict:
            reasons.append("DICT_FR")
            confidence += 0.4

        # 2. Medical suffix? (single search; original searched twice)
        suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
        has_medical_suffix = suffix_match is not None
        if has_medical_suffix:
            # Bare "-ine" is too broad on short tokens (first names).
            if suffix_match.group() == "ine" and len(token_lower) < 8:
                has_medical_suffix = False
            elif (suffix_match.group() in ("tion", "ment", "ence", "ance", "aire")
                  and len(token_lower) < 6):
                has_medical_suffix = False
        if has_medical_suffix:
            reasons.append("SUFFIXE_MED")
            confidence += 0.3

        # 3. Medical prefix?
        if _MEDICAL_PREFIXES.search(token_lower):
            reasons.append("PREFIXE_MED")
            confidence += 0.3

        # 4. DPI structural word?
        if token_lower in _STRUCTURAL_WORDS:
            reasons.append("STRUCT_DPI")
            confidence += 0.5

        # 5. All lowercase (real names are Capitalised or ALL CAPS).
        if token.islower() and len(token) > 3:
            reasons.append("MINUSCULE")
            confidence += 0.2

        # 6. Very short (<= 3) and ALL CAPS → usually an abbreviation.
        if len(token) <= 3 and token.isupper():
            reasons.append("ABREV_3CH")
            confidence += 0.2

        # 7. Seen in one single file AND in the dictionary → very suspect.
        if n_files == 1 and in_dict:
            reasons.append("1_SEUL_DOC")
            confidence += 0.2

        # 8. Hyphenated compound made only of dictionary words.
        if "-" in token:
            parts = token.split("-")
            dict_parts = [p for p in parts
                          if p.lower() in _french_words
                          and p.lower() not in _KNOWN_NAME_HOMOPHONES]
            if dict_parts and len(dict_parts) == len(parts):
                reasons.append("COMPOSE_DICT")
                confidence += 0.3

        # Threshold: at least one reason AND minimum confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": n_total,
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Highest-confidence candidates first, ties broken alphabetically.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below when the core imports
    }

    # Flag candidates already covered by the anonymizer's stop words.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        already = [c for c in fp_candidates
                   if c["token"].lower() in _MEDICAL_STOP_WORDS_SET]
        stats["already_in_stopwords"] = len(already)
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
    except ImportError:
        # Core module unavailable: mark as unknown rather than guessing.
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}


def print_report(result: dict):
    """Print a human-readable report of an :func:`analyze_audit_dir` result."""
    stats = result["stats"]
    candidates = result["fp_candidates"]

    print("=" * 70)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print("=" * 70)
    print(f" Fichiers audit analysés : {stats['audit_files']}")
    print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {stats['fp_candidates']}")
    print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
    print()

    # Split new candidates from those already handled (None = unknown,
    # treated as new so they are never silently dropped).
    new_fp = [c for c in candidates if not c.get("already_stopped")]
    old_fp = [c for c in candidates if c.get("already_stopped")]

    if new_fp:
        print(f"{'─'*70}")
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(f"{'─'*70}")
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            reasons = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")

        # Emit ready-to-paste Python code for the stop-word set.
        print(f"\n{'─'*70}")
        print(f" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(f"{'─'*70}")
        tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
        line = " "
        for t in tokens_to_add:
            entry = f'"{t}", '
            # Wrap at ~95 columns so the pasted code stays readable.
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = " "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{'─'*70}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(f"{'─'*70}")
        for c in old_fp:
            print(f" ✓ {c['token']}")
    print()


def auto_fix(result: dict, core_path: str = None):
    """Select high-confidence false positives for automatic stop-wording.

    Args:
        result: output of :func:`analyze_audit_dir`.
        core_path: currently unused; reserved for writing the tokens
            directly into the core module file.

    Returns:
        Sorted list of lowercase tokens with confidence >= 0.5
        (empty list when nothing qualifies).
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]
    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []
    tokens = sorted(set(c["token"].lower() for c in new_fp))
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")
    return tokens


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Fixed: the usage line previously omitted the mandatory directory.
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)

    audit_dir = sys.argv[1]
    do_auto_fix = "--auto-fix" in sys.argv

    result = analyze_audit_dir(audit_dir)
    print_report(result)

    if do_auto_fix:
        tokens = auto_fix(result)
        if tokens:
            print(f"\nTokens à insérer : {tokens}")