- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots), patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents - +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...), soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...) - Ville basque ajoutée : anglet - Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms - Total stop words : 5076 tokens Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
361 lines · 14 KiB · Python
#!/usr/bin/env python3
|
|
"""Détecteur automatique de faux positifs NOM_GLOBAL.
|
|
|
|
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
|
|
1. Le dictionnaire français système (/usr/share/dict/french)
|
|
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
|
|
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
|
|
|
|
Usage:
|
|
python3 audit_fp_detector.py /chemin/vers/anonymise/
|
|
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── French dictionary loading ───────────────────────────────────────
# System word list used for the DICT_FR signal: a NOM_GLOBAL token found
# in the dictionary is likely a common word rather than a person's name.
DICT_PATH = Path("/usr/share/dict/french")

# Lazily populated by _load_french_dict(); lowercase words of length >= 3.
_french_words: set = set()
|
|
|
|
def _load_french_dict():
    """Populate the module-level French word set (lazy, idempotent).

    Words shorter than 3 characters are skipped: they produce far too many
    spurious dictionary hits.  A missing dictionary file is tolerated —
    detection then simply runs without the DICT_FR signal instead of
    crashing the whole tool on systems without /usr/share/dict/french.
    """
    global _french_words
    if _french_words:
        # Already loaded — keep the cached set.
        return
    try:
        # Read as UTF-8 explicitly (word lists carry accented characters);
        # ignore stray undecodable bytes rather than aborting the analysis.
        with open(DICT_PATH, encoding="utf-8", errors="ignore") as f:
            _french_words = {w.strip().lower() for w in f if len(w.strip()) >= 3}
    except FileNotFoundError:
        print(f"Avertissement : dictionnaire introuvable ({DICT_PATH}), signal DICT_FR désactivé",
              file=sys.stderr)
|
|
|
|
# ── Common French first/last names (dictionary homophones) ─────────
# These words are BOTH in the dictionary AND real first/last names.
# They must NOT be flagged as false positives.
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also ordinary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    "christine", "caroline", "elisabeth", "elisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Female first names ending in -ine (would match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names ending in -oine/-iste/-ence, etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common surnames that are also ordinary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Surnames frequent in this corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common compound first names (would match medical suffix -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}
|
|
|
|
# ── Medical morphology patterns ─────────────────────────────────────
# Suffixes typical of French medical vocabulary, anchored at end of word.
# NOTE: the short generic suffixes (-ine, -tion, -ment, -ence, -ance,
# -aire) also match many first names; they are re-filtered by token
# length at scoring time.  Accented and unaccented spellings are both
# listed so the pattern survives OCR/accent-stripped text.
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)
|
|
|
|
# Prefixes typical of French medical vocabulary, anchored at word start;
# accented and unaccented variants are both listed so the pattern still
# matches OCR/accent-stripped text.
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)
|
|
|
|
# ── Structural / DPI (patient-record software) vocabulary ──────────
# Section headings and care-plan terms that the NER model sometimes tags
# as person names.  Stored lowercase; tokens are lowercased before lookup.
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions",
    "preparation", "préparation", "planification",
    "validation", "notification", "recommandation",
    "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection",
    "compensation", "stabilisation", "régularisation",
    "hospitalisation", "réhospitalisation",
    "amélioration", "amelioration", "dégradation", "degradation",
    "aggravation", "complication", "résolution", "resolution",
    "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}
|
|
|
|
|
|
def analyze_audit_dir(audit_dir: str) -> dict:
    """Scan every *.audit.jsonl file in *audit_dir* for NOM_GLOBAL false positives.

    Each unique NOM_GLOBAL token is scored against independent signals
    (French dictionary, medical morphology, DPI structural vocabulary,
    casing, cross-document frequency); the signal weights are summed into
    a confidence capped at 1.0.

    Returns:
        dict with:
          - fp_candidates: list of dicts {token, confidence, reasons,
            occurrences, n_files, files, already_stopped}
          - stats: summary statistics
    """
    _load_french_dict()

    audit_dir = Path(audit_dir)
    audit_files = sorted(audit_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL occurrences per source file ──────────────
    token_files = defaultdict(set)   # token → set of filenames containing it
    token_counts = defaultdict(int)  # token → total occurrence count
    all_kinds = defaultdict(int)     # kind → count (collected for debugging)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        # Audit files are JSONL written by the anonymizer; read as UTF-8
        # explicitly instead of relying on the locale encoding.
        with open(af, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    h = json.loads(line)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Malformed audit line: skip it, keep processing the file.
                    continue

    # ── Score each unique token ─────────────────────────────────────
    fp_candidates = []

    for token in sorted(token_files.keys()):
        reasons = []
        confidence = 0.0
        token_lower = token.lower()
        n_files = len(token_files[token])
        n_total = token_counts[token]

        # Real first/last names that happen to be dictionary words must
        # never be flagged as false positives.
        if token_lower in _KNOWN_NAME_HOMOPHONES:
            continue

        # 1. Present in the French dictionary?
        in_dict = token_lower in _french_words
        if in_dict:
            reasons.append("DICT_FR")
            confidence += 0.4

        # 2. Medical suffix?  (search once; the short generic suffixes
        #    are too permissive on short words, which are often names)
        suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
        has_medical_suffix = suffix_match is not None
        if has_medical_suffix:
            suffix = suffix_match.group()
            # Bare -ine on a short word is usually a first name.
            if suffix == "ine" and len(token_lower) < 8:
                has_medical_suffix = False
            elif suffix in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
                has_medical_suffix = False
        if has_medical_suffix:
            reasons.append("SUFFIXE_MED")
            confidence += 0.3

        # 3. Medical prefix?
        if _MEDICAL_PREFIXES.search(token_lower):
            reasons.append("PREFIXE_MED")
            confidence += 0.3

        # 4. Structural DPI word?
        if token_lower in _STRUCTURAL_WORDS:
            reasons.append("STRUCT_DPI")
            confidence += 0.5

        # 5. All lowercase (real names are Capitalized or ALL CAPS).
        if token.islower() and len(token) > 3:
            reasons.append("MINUSCULE")
            confidence += 0.2

        # 6. Very short (<=3) and ALL CAPS → usually an abbreviation.
        if len(token) <= 3 and token.isupper():
            reasons.append("ABREV_3CH")
            confidence += 0.2

        # 7. Appears in a single file AND in the dictionary → very suspect.
        if n_files == 1 and in_dict:
            reasons.append("1_SEUL_DOC")
            confidence += 0.2

        # 8. Hyphenated compound whose every part is a dictionary word
        #    (and none is a known name homophone).
        if "-" in token:
            parts = token.split("-")
            dict_parts = [p for p in parts
                          if p.lower() in _french_words
                          and p.lower() not in _KNOWN_NAME_HOMOPHONES]
            if dict_parts and len(dict_parts) == len(parts):
                reasons.append("COMPOSE_DICT")
                confidence += 0.3

        # Threshold: at least one reason and a minimal total confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": n_total,
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Sort by decreasing confidence, then token for a stable order.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below when the core imports
    }

    # Flag candidates already covered by the core stop-word set.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"])
    except ImportError:
        # Core module unavailable: coverage status is unknown.
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}
|
|
|
|
|
|
def print_report(result: dict):
    """Print a human-readable false-positive report to stdout."""
    s = result["stats"]
    cands = result["fp_candidates"]

    bar = "=" * 70
    rule = "─" * 70

    print(bar)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print(bar)
    print(f" Fichiers audit analysés : {s['audit_files']}")
    print(f" NOM_GLOBAL uniques : {s['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {s['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {s['fp_candidates']}")
    print(f" Déjà dans stop words : {s['already_in_stopwords']}")
    print()

    # Partition in one pass: not yet covered vs already in the stop words.
    new_fp, old_fp = [], []
    for c in cands:
        (old_fp if c.get("already_stopped") else new_fp).append(c)

    if new_fp:
        print(rule)
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(rule)
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            why = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {why}")

        # Emit ready-to-paste Python for the stop-word set.
        print(f"\n{rule}")
        print(" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(rule)
        tokens_to_add = sorted({c["token"].lower() for c in new_fp})
        line = " "
        for tok in tokens_to_add:
            entry = f'"{tok}", '
            # Wrap before exceeding ~95 columns.
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = " "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{rule}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(rule)
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()
|
|
|
|
|
|
def auto_fix(result: dict, core_path: str = None):
    """Select and print the high-confidence FP tokens to add as stop words.

    NOTE(review): contrary to the original docstring, this function does
    NOT modify the core module — it only selects and prints the tokens.
    *core_path* is accepted for a future in-place edit and is currently
    ignored.

    Args:
        result: output of analyze_audit_dir().
        core_path: reserved for future use; currently unused.

    Returns:
        Sorted list of lowercase tokens with confidence >= 0.5 that are
        not already covered by the stop-word set ([] when none qualify).
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: the audit directory is required; --auto-fix is optional.
    if len(sys.argv) < 2:
        # Usage errors belong on stderr so piped stdout stays clean.
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]", file=sys.stderr)
        sys.exit(1)

    audit_dir = sys.argv[1]
    do_auto_fix = "--auto-fix" in sys.argv

    result = analyze_audit_dir(audit_dir)
    print_report(result)

    if do_auto_fix:
        tokens = auto_fix(result)
        if tokens:
            print(f"\nTokens à insérer : {tokens}")