- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots), patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents - +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...), soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...) - Ville basque ajoutée : anglet - Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms - Total stop words : 5076 tokens Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
361 lines · 14 KiB · Python
#!/usr/bin/env python3
|
|
"""Détecteur automatique de faux positifs NOM_GLOBAL.
|
|
|
|
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
|
|
1. Le dictionnaire français système (/usr/share/dict/french)
|
|
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
|
|
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
|
|
|
|
Usage:
|
|
python3 audit_fp_detector.py /chemin/vers/anonymise/
|
|
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── French dictionary loading ───────────────────────────────────────
# System word list used for the DICT_FR signal: a NOM_GLOBAL token found
# in the dictionary is likely a common word rather than a person's name.
DICT_PATH = Path("/usr/share/dict/french")

# Lazily populated by _load_french_dict(); lowercase words of length >= 3.
_french_words: set = set()
|
|
|
|
def _load_french_dict():
    """Populate the module-level French word set (lazy, idempotent).

    Words shorter than 3 characters are skipped: they produce far too many
    spurious dictionary hits.  A missing dictionary file is tolerated —
    detection then simply runs without the DICT_FR signal instead of
    crashing the whole tool on systems without /usr/share/dict/french.
    """
    global _french_words
    if _french_words:
        # Already loaded — keep the cached set.
        return
    try:
        # Read as UTF-8 explicitly (word lists carry accented characters);
        # ignore stray undecodable bytes rather than aborting the analysis.
        with open(DICT_PATH, encoding="utf-8", errors="ignore") as f:
            _french_words = {w.strip().lower() for w in f if len(w.strip()) >= 3}
    except FileNotFoundError:
        print(f"Avertissement : dictionnaire introuvable ({DICT_PATH}), signal DICT_FR désactivé",
              file=sys.stderr)
|
|
|
|
# ── Common French first/last names (dictionary homophones) ─────────
# These words are BOTH in the dictionary AND real first/last names.
# They must NOT be flagged as false positives.
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also ordinary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    "christine", "caroline", "elisabeth", "elisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Female first names ending in -ine (would match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names ending in -oine/-iste/-ence, etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common surnames that are also ordinary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Surnames frequent in this corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common compound first names (would match medical suffix -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}
|
|
|
|
# ── Medical morphology patterns ─────────────────────────────────────
# Suffixes typical of French medical vocabulary, anchored at end of word.
# NOTE: the short generic suffixes (-ine, -tion, -ment, -ence, -ance,
# -aire) also match many first names; they are re-filtered by token
# length at scoring time.  Accented and unaccented spellings are both
# listed so the pattern survives OCR/accent-stripped text.
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)
|
|
|
|
# Prefixes typical of French medical vocabulary, anchored at word start;
# accented and unaccented variants are both listed so the pattern still
# matches OCR/accent-stripped text.
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)
|
|
|
|
# ── Structural / DPI (patient-record software) vocabulary ──────────
# Section headings and care-plan terms that the NER model sometimes tags
# as person names.  Stored lowercase; tokens are lowercased before lookup.
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions",
    "preparation", "préparation", "planification",
    "validation", "notification", "recommandation",
    "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection",
    "compensation", "stabilisation", "régularisation",
    "hospitalisation", "réhospitalisation",
    "amélioration", "amelioration", "dégradation", "degradation",
    "aggravation", "complication", "résolution", "resolution",
    "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}
|
|
|
|
|
|
def analyze_audit_dir(audit_dir: str) -> dict:
    """Scan every *.audit.jsonl file in *audit_dir* for NOM_GLOBAL false positives.

    Each unique NOM_GLOBAL token is scored against independent signals
    (French dictionary, medical morphology, DPI structural vocabulary,
    casing, cross-document frequency); the signal weights are summed into
    a confidence capped at 1.0.

    Returns:
        dict with:
          - fp_candidates: list of dicts {token, confidence, reasons,
            occurrences, n_files, files, already_stopped}
          - stats: summary statistics
    """
    _load_french_dict()

    audit_dir = Path(audit_dir)
    audit_files = sorted(audit_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL occurrences per source file ──────────────
    token_files = defaultdict(set)   # token → set of filenames containing it
    token_counts = defaultdict(int)  # token → total occurrence count
    all_kinds = defaultdict(int)     # kind → count (collected for debugging)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        # Audit files are JSONL written by the anonymizer; read as UTF-8
        # explicitly instead of relying on the locale encoding.
        with open(af, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    h = json.loads(line)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Malformed audit line: skip it, keep processing the file.
                    continue

    # ── Score each unique token ─────────────────────────────────────
    fp_candidates = []

    for token in sorted(token_files.keys()):
        reasons = []
        confidence = 0.0
        token_lower = token.lower()
        n_files = len(token_files[token])
        n_total = token_counts[token]

        # Real first/last names that happen to be dictionary words must
        # never be flagged as false positives.
        if token_lower in _KNOWN_NAME_HOMOPHONES:
            continue

        # 1. Present in the French dictionary?
        in_dict = token_lower in _french_words
        if in_dict:
            reasons.append("DICT_FR")
            confidence += 0.4

        # 2. Medical suffix?  (search once; the short generic suffixes
        #    are too permissive on short words, which are often names)
        suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
        has_medical_suffix = suffix_match is not None
        if has_medical_suffix:
            suffix = suffix_match.group()
            # Bare -ine on a short word is usually a first name.
            if suffix == "ine" and len(token_lower) < 8:
                has_medical_suffix = False
            elif suffix in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
                has_medical_suffix = False
        if has_medical_suffix:
            reasons.append("SUFFIXE_MED")
            confidence += 0.3

        # 3. Medical prefix?
        if _MEDICAL_PREFIXES.search(token_lower):
            reasons.append("PREFIXE_MED")
            confidence += 0.3

        # 4. Structural DPI word?
        if token_lower in _STRUCTURAL_WORDS:
            reasons.append("STRUCT_DPI")
            confidence += 0.5

        # 5. All lowercase (real names are Capitalized or ALL CAPS).
        if token.islower() and len(token) > 3:
            reasons.append("MINUSCULE")
            confidence += 0.2

        # 6. Very short (<=3) and ALL CAPS → usually an abbreviation.
        if len(token) <= 3 and token.isupper():
            reasons.append("ABREV_3CH")
            confidence += 0.2

        # 7. Appears in a single file AND in the dictionary → very suspect.
        if n_files == 1 and in_dict:
            reasons.append("1_SEUL_DOC")
            confidence += 0.2

        # 8. Hyphenated compound whose every part is a dictionary word
        #    (and none is a known name homophone).
        if "-" in token:
            parts = token.split("-")
            dict_parts = [p for p in parts
                          if p.lower() in _french_words
                          and p.lower() not in _KNOWN_NAME_HOMOPHONES]
            if dict_parts and len(dict_parts) == len(parts):
                reasons.append("COMPOSE_DICT")
                confidence += 0.3

        # Threshold: at least one reason and a minimal total confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": n_total,
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Sort by decreasing confidence, then token for a stable order.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below when the core imports
    }

    # Flag candidates already covered by the core stop-word set.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"])
    except ImportError:
        # Core module unavailable: coverage status is unknown.
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}
|
|
|
|
|
|
def print_report(result: dict):
    """Print a human-readable false-positive report to stdout."""
    s = result["stats"]
    cands = result["fp_candidates"]

    bar = "=" * 70
    rule = "─" * 70

    print(bar)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print(bar)
    print(f" Fichiers audit analysés : {s['audit_files']}")
    print(f" NOM_GLOBAL uniques : {s['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {s['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {s['fp_candidates']}")
    print(f" Déjà dans stop words : {s['already_in_stopwords']}")
    print()

    # Partition in one pass: not yet covered vs already in the stop words.
    new_fp, old_fp = [], []
    for c in cands:
        (old_fp if c.get("already_stopped") else new_fp).append(c)

    if new_fp:
        print(rule)
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(rule)
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            why = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {why}")

        # Emit ready-to-paste Python for the stop-word set.
        print(f"\n{rule}")
        print(" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(rule)
        tokens_to_add = sorted({c["token"].lower() for c in new_fp})
        line = " "
        for tok in tokens_to_add:
            entry = f'"{tok}", '
            # Wrap before exceeding ~95 columns.
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = " "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{rule}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(rule)
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()
|
|
|
|
|
|
def auto_fix(result: dict, core_path: str = None):
    """Select and print the high-confidence FP tokens to add as stop words.

    NOTE(review): contrary to the original docstring, this function does
    NOT modify the core module — it only selects and prints the tokens.
    *core_path* is accepted for a future in-place edit and is currently
    ignored.

    Args:
        result: output of analyze_audit_dir().
        core_path: reserved for future use; currently unused.

    Returns:
        Sorted list of lowercase tokens with confidence >= 0.5 that are
        not already covered by the stop-word set ([] when none qualify).
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: the audit directory is required; --auto-fix is optional.
    if len(sys.argv) < 2:
        # Usage errors belong on stderr so piped stdout stays clean.
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]", file=sys.stderr)
        sys.exit(1)

    audit_dir = sys.argv[1]
    do_auto_fix = "--auto-fix" in sys.argv

    result = analyze_audit_dir(audit_dir)
    print_report(result)

    if do_auto_fix:
        tokens = auto_fix(result)
        if tokens:
            print(f"\nTokens à insérer : {tokens}")