Stop words +170 : détection automatique FP via dictionnaire français (audit_fp_detector.py)

- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots),
  patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents
- +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...),
  soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...)
- Ville basque ajoutée : anglet
- Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms
- Total stop words : 5076 tokens

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 10:04:33 +01:00
parent 84be2a5176
commit cb84698c2d
2 changed files with 405 additions and 1 deletions

View File

@@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = {
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
# FP audit OGC 17 CRH
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
# Spécialités/services récurrents comme FP NOM
"cancérologie", "cancerologie", "réanimation", "reanimation",
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
@@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = {
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
"planification", "planifié", "planifiee",
# ── FP détectés automatiquement par audit_fp_detector.py ──
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
"vernis", "vessie", "vrac",
# Lot 2 : termes médicaux (préfixes/suffixes)
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
"cétonurie", "cetonurie", "depilation", "dépilation",
"folique", "gastroentérologue", "gastroenterologue",
"microgrammes", "nalidixique", "naso-gastrique",
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
"cyto", "plaie-colle", "bionolyte",
# Lot 1 (103 tokens, confiance >= 0.5) ──
# Anatomie / clinique
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
# Pathologies / symptômes
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
# Médicaments / matériel médical
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
"oxygène", "pansement", "vitamine",
# Biologie / examens
"biochimie", "biologie", "fer",
# Actions / états cliniques
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
"intervention", "position", "rappel", "relation", "retour", "réalisation",
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
"urgent", "validation",
# Mots courants / contextuels
"angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
"clause", "concubin", "confortable", "demain", "densité", "dernière",
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
"personne", "premier", "quartier", "retraite", "route", "rés",
"tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
}
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())

360
audit_fp_detector.py Normal file
View File

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""Détecteur automatique de faux positifs NOM_GLOBAL.
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
1. Le dictionnaire français système (/usr/share/dict/french)
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
Usage:
python3 audit_fp_detector.py /chemin/vers/anonymise/
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
"""
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
# ── French dictionary loading ───────────────────────────────────────
DICT_PATH = Path("/usr/share/dict/french")
# Lazily-populated cache of lowercase dictionary words (length >= 3).
_french_words: set = set()


def _load_french_dict() -> None:
    """Populate the module-level ``_french_words`` set from the system dictionary.

    Idempotent: once the set is non-empty, subsequent calls return immediately.
    Words shorter than 3 characters are skipped (too noisy for FP matching).

    Raises:
        FileNotFoundError: if ``DICT_PATH`` does not exist on this system.
    """
    global _french_words
    if _french_words:
        return  # already loaded
    # Explicit encoding: the previous `open(DICT_PATH)` used the locale
    # default. System word lists may be latin-1; errors="replace" keeps
    # loading instead of crashing on a stray byte.
    with open(DICT_PATH, encoding="utf-8", errors="replace") as f:
        _french_words = {w.strip().lower() for w in f if len(w.strip()) >= 3}
# ── Common French first/last names (dictionary homonyms) ────────────
# These words are BOTH in the dictionary AND real first/last names.
# They must NEVER be flagged as false positives.
# Fix: the original contained the literal "elisabeth" twice; following the
# file's accented/unaccented pairing convention (clement/clément,
# andre/andré), the second occurrence is restored as "élisabeth".
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also dictionary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    "christine", "caroline", "elisabeth", "élisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Feminine first names ending in -ine (would match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names in -oine/-iste/-ence etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common family names that are also dictionary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Family names frequent in this corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common hyphenated first names (match medical suffixes -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}
# ── Medical morphology patterns ─────────────────────────────────────
# Suffixes typical of French medical vocabulary (-ite, -ose, -ectomie…).
# NOTE(review): the short nominal suffixes (ine/tion/ment/ence/ance/aire)
# are very broad; analyze_audit_dir suppresses them on short tokens so
# first names like "martine" are not flagged by suffix alone.
_MEDICAL_SUFFIXES = re.compile(
r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
re.IGNORECASE,
)
# Prefixes typical of medical vocabulary (anti-, hyper-, gastro-…).
# Anchored at the start only, so any token beginning with one matches.
_MEDICAL_PREFIXES = re.compile(
r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
r"|pancréato|pancreato|hépato|hepato|splén|splen)",
re.IGNORECASE,
)
# ── Structural / care-record software (DPI) words ──────────────────
# Vocabulary of the electronic patient record software (original comment
# said "trackare" — presumably the InterSystems TrakCare hospital IS;
# TODO confirm). These nominalized action words are recurrent NOM_GLOBAL
# false positives in nursing/medical notes.
_STRUCTURAL_WORDS = {
"observation", "observations", "consultation", "prescripteur",
"administration", "evaluation", "évaluation", "indication",
"surveillance", "transmission", "transmissions",
"preparation", "préparation", "planification",
"validation", "notification", "recommandation",
"intervention", "installation", "mobilisation",
"exploration", "perfusion", "transfusion", "aspiration",
"nutrition", "hydratation", "oxygénation", "oxygenation",
"cicatrisation", "désinfection", "desinfection",
"compensation", "stabilisation", "régularisation",
"hospitalisation", "réhospitalisation",
"amélioration", "amelioration", "dégradation", "degradation",
"aggravation", "complication", "résolution", "resolution",
"progression", "régression", "regression",
"rééducation", "reeducation", "adaptation", "réadaptation",
"orientation", "coordination", "organisation",
}
def _collect_nom_global(audit_files):
    """Gather NOM_GLOBAL hits from ``.audit.jsonl`` files (one JSON object per line).

    Returns a 3-tuple ``(token_files, token_counts, all_kinds)``:
    - token_files: token -> set of source filenames (``.audit`` suffix removed)
    - token_counts: token -> total occurrence count
    - all_kinds: kind -> count over every parsed audit line
    """
    token_files = defaultdict(set)
    token_counts = defaultdict(int)
    all_kinds = defaultdict(int)
    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        with open(af) as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    h = json.loads(raw)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (ValueError, KeyError, TypeError):
                    # Malformed audit line: best-effort skip. Narrowed from
                    # the original bare `except Exception: pass` so genuine
                    # bugs (e.g. I/O errors) are no longer hidden.
                    continue
    return token_files, token_counts, all_kinds


def _score_token(token, n_files):
    """Score one NOM_GLOBAL token as a false-positive candidate.

    Returns ``(confidence, reasons)``: the sum of the triggered heuristic
    weights and the list of heuristic tags that fired.
    """
    reasons = []
    confidence = 0.0
    token_lower = token.lower()
    # 1. Present in the French dictionary?
    in_dict = token_lower in _french_words
    if in_dict:
        reasons.append("DICT_FR")
        confidence += 0.4
    # 2. Medical suffix? Single regex search (the original searched twice).
    m = _MEDICAL_SUFFIXES.search(token_lower)
    has_medical_suffix = m is not None
    if m is not None:
        # Bare "-ine" on short tokens matches first names; generic nominal
        # suffixes are too broad on very short words.
        if m.group() == "ine" and len(token_lower) < 8:
            has_medical_suffix = False
        elif m.group() in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
            has_medical_suffix = False
    if has_medical_suffix:
        reasons.append("SUFFIXE_MED")
        confidence += 0.3
    # 3. Medical prefix?
    if _MEDICAL_PREFIXES.search(token_lower):
        reasons.append("PREFIXE_MED")
        confidence += 0.3
    # 4. Structural care-record (DPI) word?
    if token_lower in _STRUCTURAL_WORDS:
        reasons.append("STRUCT_DPI")
        confidence += 0.5
    # 5. All lowercase (real names are Capitalised or ALL CAPS).
    if token.islower() and len(token) > 3:
        reasons.append("MINUSCULE")
        confidence += 0.2
    # 6. Very short (<= 3 chars) ALL-CAPS -> usually an abbreviation.
    if len(token) <= 3 and token.isupper():
        reasons.append("ABREV_3CH")
        confidence += 0.2
    # 7. Appears in a single file AND is a dictionary word -> very suspect.
    if n_files == 1 and in_dict:
        reasons.append("1_SEUL_DOC")
        confidence += 0.2
    # 8. Hyphenated compound made entirely of dictionary words
    #    (ignoring parts that are known name homonyms).
    if "-" in token:
        parts = token.split("-")
        dict_parts = [p for p in parts
                      if p.lower() in _french_words
                      and p.lower() not in _KNOWN_NAME_HOMOPHONES]
        if dict_parts and len(dict_parts) == len(parts):
            reasons.append("COMPOSE_DICT")
            confidence += 0.3
    return confidence, reasons


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyse every ``.audit.jsonl`` file in *audit_dir*.

    Returns a dict with:
    - "fp_candidates": list of candidate dicts (token, confidence, reasons,
      occurrences, n_files, sample files, already_stopped flag)
    - "stats": summary statistics (empty dict when no audit file is found)
    """
    _load_french_dict()
    audit_path = Path(audit_dir)
    audit_files = sorted(audit_path.glob("*.audit.jsonl"))
    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_path}")
        return {"fp_candidates": [], "stats": {}}
    # ── Collect NOM_GLOBAL per file ─────────────────────────────────
    token_files, token_counts, _all_kinds = _collect_nom_global(audit_files)
    # ── Score each token ────────────────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files):
        # Known first/last-name homonyms are never flagged.
        if token.lower() in _KNOWN_NAME_HOMOPHONES:
            continue
        n_files = len(token_files[token])
        confidence, reasons = _score_token(token, n_files)
        # Threshold: at least one heuristic fired and confidence >= 0.3.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": token_counts[token],
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })
    # Highest confidence first, then alphabetical for stable output.
    fp_candidates.sort(key=lambda c: (-c["confidence"], c["token"]))
    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below when the core is importable
    }
    # ── Mark candidates already covered by the core stop-word set ──
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"])
    except ImportError:
        # Core module unavailable (standalone run): flag as unknown.
        for c in fp_candidates:
            c["already_stopped"] = None
    return {"fp_candidates": fp_candidates, "stats": stats}
def print_report(result: dict):
"""Affiche un rapport lisible."""
stats = result["stats"]
candidates = result["fp_candidates"]
print("=" * 70)
print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
print("=" * 70)
print(f" Fichiers audit analysés : {stats['audit_files']}")
print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
print(f" Candidats FP détectés : {stats['fp_candidates']}")
print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
print()
# Séparer nouveaux vs déjà traités
new_fp = [c for c in candidates if not c.get("already_stopped")]
old_fp = [c for c in candidates if c.get("already_stopped")]
if new_fp:
print(f"{''*70}")
print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
print(f"{''*70}")
print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
print(f" {''*24} {''*5} {''*4} {''*4} {''*30}")
for c in new_fp:
reasons = ", ".join(c["reasons"])
print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")
# Générer le code Python à copier
print(f"\n{''*70}")
print(f" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
print(f"{''*70}")
tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
line = " "
for i, t in enumerate(tokens_to_add):
entry = f'"{t}", '
if len(line) + len(entry) > 95:
print(line.rstrip(", "))
line = " "
line += entry
if line.strip():
print(line.rstrip(", "))
else:
print(" Aucun nouveau FP détecté !")
if old_fp:
print(f"\n{''*70}")
print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
print(f"{''*70}")
for c in old_fp:
print(f"{c['token']}")
print()
def auto_fix(result: dict, core_path: "str | None" = None):
    """Select high-confidence FP tokens to add to the core stop words.

    Keeps candidates that are not already stopped and have confidence >= 0.5,
    prints them, and returns the sorted, deduplicated, lowercased token list
    (empty list when nothing qualifies).

    *core_path* is reserved for a future in-place edit of the core file and
    is currently unused; the annotation is fixed from ``str`` to optional.
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]
    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []
    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")
    return tokens
if __name__ == "__main__":
    # CLI entry point: positional audit directory, optional --auto-fix flag.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)
    target_dir = sys.argv[1]
    wants_fix = "--auto-fix" in sys.argv
    report = analyze_audit_dir(target_dir)
    print_report(report)
    if wants_fix:
        added = auto_fix(report)
        if added:
            print(f"\nTokens à insérer : {added}")