Stop words +170 : détection automatique FP via dictionnaire français (audit_fp_detector.py)
- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots), patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents - +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...), soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...) - Ville basque ajoutée : anglet - Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms - Total stop words : 5076 tokens Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
||||||
# FP audit OGC 17 CRH
|
# FP audit OGC 17 CRH
|
||||||
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
||||||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne",
|
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
|
||||||
# Spécialités/services récurrents comme FP NOM
|
# Spécialités/services récurrents comme FP NOM
|
||||||
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
||||||
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
||||||
@@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"transmissions", "transmission", "releve", "relevé",
|
"transmissions", "transmission", "releve", "relevé",
|
||||||
"objectif", "objectifs", "evaluation", "évaluation",
|
"objectif", "objectifs", "evaluation", "évaluation",
|
||||||
"planification", "planifié", "planifiee",
|
"planification", "planifié", "planifiee",
|
||||||
|
# ── FP détectés automatiquement par audit_fp_detector.py ──
|
||||||
|
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
|
||||||
|
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
|
||||||
|
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
|
||||||
|
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
|
||||||
|
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
|
||||||
|
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
|
||||||
|
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
|
||||||
|
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
|
||||||
|
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
|
||||||
|
"vernis", "vessie", "vrac",
|
||||||
|
# Lot 2 : termes médicaux (préfixes/suffixes)
|
||||||
|
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
|
||||||
|
"cétonurie", "cetonurie", "depilation", "dépilation",
|
||||||
|
"folique", "gastroentérologue", "gastroenterologue",
|
||||||
|
"microgrammes", "nalidixique", "naso-gastrique",
|
||||||
|
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
|
||||||
|
"cyto", "plaie-colle", "bionolyte",
|
||||||
|
# Lot 1 (103 tokens, confiance >= 0.5) ──
|
||||||
|
# Anatomie / clinique
|
||||||
|
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
|
||||||
|
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
|
||||||
|
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
|
||||||
|
# Pathologies / symptômes
|
||||||
|
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
|
||||||
|
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
|
||||||
|
# Médicaments / matériel médical
|
||||||
|
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
|
||||||
|
"oxygène", "pansement", "vitamine",
|
||||||
|
# Biologie / examens
|
||||||
|
"biochimie", "biologie", "fer",
|
||||||
|
# Actions / états cliniques
|
||||||
|
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
|
||||||
|
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
|
||||||
|
"intervention", "position", "rappel", "relation", "retour", "réalisation",
|
||||||
|
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
|
||||||
|
"urgent", "validation",
|
||||||
|
# Mots courants / contextuels
|
||||||
|
"angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
|
||||||
|
"clause", "concubin", "confortable", "demain", "densité", "dernière",
|
||||||
|
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
|
||||||
|
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
|
||||||
|
"personne", "premier", "quartier", "retraite", "route", "rés",
|
||||||
|
"tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
|
||||||
}
|
}
|
||||||
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
||||||
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
||||||
|
|||||||
360
audit_fp_detector.py
Normal file
360
audit_fp_detector.py
Normal file
@@ -0,0 +1,360 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Détecteur automatique de faux positifs NOM_GLOBAL.
|
||||||
|
|
||||||
|
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
|
||||||
|
1. Le dictionnaire français système (/usr/share/dict/french)
|
||||||
|
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
|
||||||
|
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 audit_fp_detector.py /chemin/vers/anonymise/
|
||||||
|
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ── Chargement dictionnaire français ────────────────────────────────
|
||||||
|
DICT_PATH = Path("/usr/share/dict/french")
|
||||||
|
_french_words: set = set()
|
||||||
|
|
||||||
|
def _load_french_dict():
|
||||||
|
global _french_words
|
||||||
|
if _french_words:
|
||||||
|
return
|
||||||
|
with open(DICT_PATH) as f:
|
||||||
|
_french_words = set(w.strip().lower() for w in f if len(w.strip()) >= 3)
|
||||||
|
|
||||||
|
# ── Prénoms/noms français courants (homonymes dictionnaire) ────────
|
||||||
|
# Ces mots sont à la fois dans le dictionnaire ET sont des prénoms/noms réels.
|
||||||
|
# Ils ne doivent PAS être flagués comme FP.
|
||||||
|
_KNOWN_NAME_HOMOPHONES = {
|
||||||
|
# Prénoms courants qui sont aussi des mots
|
||||||
|
"martin", "bernard", "petit", "richard", "moreau", "laurent",
|
||||||
|
"simon", "pierre", "marie", "jean", "paul", "louis", "marc",
|
||||||
|
"charles", "henry", "victor", "rose", "marguerite", "pascal",
|
||||||
|
"leon", "léon", "auguste", "clement", "clément", "olive",
|
||||||
|
"sylvie", "denis", "raymond", "roger", "maxime", "claude",
|
||||||
|
"marcel", "germaine", "alice", "florence", "dominique",
|
||||||
|
"christine", "caroline", "elisabeth", "elisabeth", "thomas",
|
||||||
|
"nicolas", "vincent", "benjamin", "lucien", "gaston",
|
||||||
|
"annette", "colette", "suzanne", "andre", "andré", "rené",
|
||||||
|
"yves", "gilles", "noel", "noël", "aimé", "aime",
|
||||||
|
"guy", "joël", "joelle", "gilbert", "fernand", "édith",
|
||||||
|
"edith", "agnès", "agnes", "jeanne", "lucie", "laure",
|
||||||
|
"adrien", "bastien", "julien", "viviane", "constance",
|
||||||
|
"armand", "blanche", "clémence", "clemence", "prudence", "patience",
|
||||||
|
"grace", "grâce", "fidèle", "placide",
|
||||||
|
# Prénoms féminins en -ine/-ine (matchent le suffixe médical)
|
||||||
|
"adeline", "aline", "amandine", "capucine", "celine", "céline",
|
||||||
|
"coline", "catherine", "clementine", "clémentine", "delphine",
|
||||||
|
"emeline", "émeline", "frédérique", "frederique", "ghislaine",
|
||||||
|
"gwendoline", "justine", "karine", "laurence", "laurie",
|
||||||
|
"marceline", "marine", "marjolaine", "martine", "madeleine",
|
||||||
|
"melaine", "moline", "morgane", "nadine", "noémie", "noemie",
|
||||||
|
"pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
|
||||||
|
"tiphaine", "virginie",
|
||||||
|
# Prénoms en -oine/-iste/-ence etc.
|
||||||
|
"antoine", "baptiste", "patrice", "romain", "charlotte",
|
||||||
|
"alexandra", "aurore", "jules", "jacques", "mathieu",
|
||||||
|
"olivier", "ana", "maria", "pascale", "laura", "margot",
|
||||||
|
"marina", "maite", "maïté",
|
||||||
|
# Noms de famille courants qui sont aussi des mots
|
||||||
|
"blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
|
||||||
|
"page", "comte", "baron", "marin", "fournier", "bonhomme",
|
||||||
|
"boucher", "berger", "marchand", "chevalier", "gros",
|
||||||
|
"masson", "bonnet", "vidal", "meunier", "fontaine",
|
||||||
|
"robin", "gay", "perrin", "roche", "rey",
|
||||||
|
"maître", "maitre", "salle", "aubry",
|
||||||
|
# Noms de famille fréquents dans le corpus
|
||||||
|
"barbotin", "brocard", "brunet", "cailliez", "charrier",
|
||||||
|
"colas", "combes", "forges", "gaillard", "galidie",
|
||||||
|
"gendre", "genevois", "grenier", "lemoine", "martel",
|
||||||
|
"martial", "moulin", "pineau", "piper", "pontier",
|
||||||
|
"poulot", "rosier", "roussel-fontaine", "saule", "senne",
|
||||||
|
"serrate", "serre", "taris", "vigneau", "vignes", "voisin",
|
||||||
|
"barre", "campagnard", "claire", "capera", "bigourdan",
|
||||||
|
"breton", "rainer", "bataille", "cabaner", "klement",
|
||||||
|
"boucau", "marte", "dores", "culot",
|
||||||
|
# Prénoms composés courants (matchent suffixe médical -ine/-ence)
|
||||||
|
"anne-christine", "marie-christine", "marie-line",
|
||||||
|
"marie-marceline", "berges",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Medical morphology patterns ─────────────────────────────────────
# A token whose lower-cased form ends with one of these suffixes
# (…-ite, -ose, -ectomie, -pathie, …) looks like medical vocabulary
# rather than a person name. Anchored at the end only ($) and used with
# .search() by the caller; both accented and unaccented spellings are
# listed (émie/emie, pnée/pnee, …) because tokens may lack accents.
# NOTE: short generic suffixes here ("ine", "tion", "ment", …) are
# deliberately re-filtered by length in analyze_audit_dir() to avoid
# matching first names such as "martine".
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)

# A token starting with one of these prefixes (anti-, hyper-, gastro-,
# cardio-, …) is likewise treated as a medical term. Anchored at the
# start (^); accented and unaccented variants are both listed.
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
# ── Structural words from DPI documents ─────────────────────────────
# Vocabulary coming from the structure of hospital EHR (DPI) documents:
# section headings, nursing actions, care-plan labels. When tagged as
# NOM_GLOBAL these are almost certainly false positives. Membership is
# tested on the lower-cased token; accented and unaccented variants are
# both listed.
# NOTE(review): the original comment said "trackare" — presumably the
# TrakCare DPI software; confirm with the document source.
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions",
    "preparation", "préparation", "planification",
    "validation", "notification", "recommandation",
    "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection",
    "compensation", "stabilisation", "régularisation",
    "hospitalisation", "réhospitalisation",
    "amélioration", "amelioration", "dégradation", "degradation",
    "aggravation", "complication", "résolution", "resolution",
    "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_nom_global(audit_files):
    """Scan .audit.jsonl files and tally NOM_GLOBAL hits.

    Returns (token_files, token_counts, all_kinds) where token_files maps
    token → set of source filenames, token_counts maps token → total
    occurrences, and all_kinds counts every hit kind seen.
    """
    token_files = defaultdict(set)
    token_counts = defaultdict(int)
    all_kinds = defaultdict(int)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        # Audit files are JSONL; assumed UTF-8 — TODO confirm writer side.
        with open(af, encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    hit = json.loads(raw)
                    all_kinds[hit["kind"]] += 1
                    if hit["kind"] == "NOM_GLOBAL":
                        token = hit["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best-effort parsing: skip malformed or incomplete
                    # audit lines instead of aborting the whole run.
                    # (Narrowed from a bare `except Exception: pass`.)
                    continue

    return token_files, token_counts, all_kinds


def _score_token(token, n_files):
    """Score one NOM_GLOBAL token against the FP heuristics.

    Args:
        token: the original (case-preserved) token.
        n_files: number of distinct documents the token appears in.

    Returns:
        (reasons, confidence) — list of heuristic tags and the summed
        confidence. An empty reasons list means "looks like a real name".
    """
    reasons = []
    confidence = 0.0
    token_lower = token.lower()

    # 1. Present in the French dictionary?
    in_dict = token_lower in _french_words
    if in_dict:
        reasons.append("DICT_FR")
        confidence += 0.4

    # 2. Medical suffix? (search once; the original searched twice)
    suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
    has_medical_suffix = bool(suffix_match)
    if suffix_match:
        # Bare "-ine" is too broad for short tokens (first names).
        if suffix_match.group() == "ine" and len(token_lower) < 8:
            has_medical_suffix = False
        # Generic noun suffixes are too broad for very short tokens.
        elif suffix_match.group() in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
            has_medical_suffix = False
    if has_medical_suffix:
        reasons.append("SUFFIXE_MED")
        confidence += 0.3

    # 3. Medical prefix?
    if _MEDICAL_PREFIXES.search(token_lower):
        reasons.append("PREFIXE_MED")
        confidence += 0.3

    # 4. Structural DPI word?
    if token_lower in _STRUCTURAL_WORDS:
        reasons.append("STRUCT_DPI")
        confidence += 0.5

    # 5. All lowercase (real names are Capitalized or ALL CAPS).
    if token.islower() and len(token) > 3:
        reasons.append("MINUSCULE")
        confidence += 0.2

    # 6. Very short (<=3) and ALL CAPS → usually an abbreviation.
    if len(token) <= 3 and token.isupper():
        reasons.append("ABREV_3CH")
        confidence += 0.2

    # 7. Appears in a single document AND is a dictionary word → suspect.
    if n_files == 1 and in_dict:
        reasons.append("1_SEUL_DOC")
        confidence += 0.2

    # 8. Hyphenated compound made entirely of dictionary words
    #    (excluding known name homophones).
    if "-" in token:
        parts = token.split("-")
        dict_parts = [
            p for p in parts
            if p.lower() in _french_words
            and p.lower() not in _KNOWN_NAME_HOMOPHONES
        ]
        if dict_parts and len(dict_parts) == len(parts):
            reasons.append("COMPOSE_DICT")
            confidence += 0.3

    return reasons, confidence


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyze every .audit.jsonl file of a directory for NOM_GLOBAL FPs.

    Args:
        audit_dir: directory containing ``*.audit.jsonl`` files.

    Returns:
        dict with:
        - "fp_candidates": list of candidate dicts (token, confidence,
          reasons, occurrences, n_files, files, already_stopped);
        - "stats": summary statistics (empty dict if no audit files).
    """
    _load_french_dict()

    audit_dir = Path(audit_dir)
    audit_files = sorted(audit_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL hits per file ────────────────────────────
    token_files, token_counts, _all_kinds = _collect_nom_global(audit_files)

    # ── Score each token ────────────────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files):
        # Known first/last-name homophones are never flagged.
        if token.lower() in _KNOWN_NAME_HOMOPHONES:
            continue

        n_files = len(token_files[token])
        reasons, confidence = _score_token(token, n_files)

        # Threshold: at least one reason and a minimal confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": token_counts[token],
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Sort by decreasing confidence, then token for determinism.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below
    }

    # Mark candidates that are already in the core stop words.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"]
        )
    except ImportError:
        # Core module unavailable: leave the flag undetermined (None).
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}
|
||||||
|
|
||||||
|
|
||||||
|
def print_report(result: dict):
    """Print a human-readable report of the FP analysis.

    Expects the dict returned by analyze_audit_dir(). Output includes a
    summary header, the new FP candidates as an aligned table, a ready-
    to-paste Python snippet for _MEDICAL_STOP_WORDS_SET, and the list of
    candidates already covered by the stop words.
    """
    stats = result["stats"]
    candidates = result["fp_candidates"]

    print("=" * 70)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print("=" * 70)
    print(f" Fichiers audit analysés : {stats['audit_files']}")
    print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {stats['fp_candidates']}")
    print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
    print()

    # Split new candidates from those already in the stop words.
    # .get() so that already_stopped == None (core not importable)
    # counts as "new".
    new_fp = [c for c in candidates if not c.get("already_stopped")]
    old_fp = [c for c in candidates if c.get("already_stopped")]

    if new_fp:
        print(f"{'─'*70}")
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(f"{'─'*70}")
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            reasons = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")

        # Emit a copy-pastable Python snippet for the stop-words set.
        print(f"\n{'─'*70}")
        print(f" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(f"{'─'*70}")
        tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
        line = "    "
        for i, t in enumerate(tokens_to_add):
            entry = f'"{t}", '
            # Wrap lines at ~95 chars; rstrip(", ") removes the trailing
            # separator characters (char-set strip, intentional here).
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = "    "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{'─'*70}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(f"{'─'*70}")
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()
|
||||||
|
|
||||||
|
|
||||||
|
def auto_fix(result: dict, core_path=None):
    """Select high-confidence FP tokens to add to the core stop words.

    Despite the name, nothing is written anywhere yet: the function
    prints and returns the candidate tokens so the caller (or a human)
    can insert them into _MEDICAL_STOP_WORDS_SET.

    Args:
        result: the dict returned by analyze_audit_dir().
        core_path: reserved for a future in-place edit of the core
            module; currently unused. (Annotation fixed: the default is
            None, so it was mistyped as plain ``str``.)

    Returns:
        Sorted list of unique lower-cased tokens with confidence >= 0.5
        that are not already stop words; empty list if none qualify.
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    # Set comprehension: deduplicate case variants of the same token.
    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: analyze an audit directory, print the report,
    and optionally list the tokens to auto-add (--auto-fix)."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)

    target_dir = sys.argv[1]
    wants_fix = "--auto-fix" in sys.argv

    report = analyze_audit_dir(target_dir)
    print_report(report)

    if wants_fix:
        added = auto_fix(report)
        if added:
            print(f"\nTokens à insérer : {added}")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user