Stop words +170 : détection automatique FP via dictionnaire français (audit_fp_detector.py)

- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots),
  patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents
- +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...),
  soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...)
- Ville basque ajoutée : anglet
- Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms
- Total stop words : 5076 tokens

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 10:04:33 +01:00
parent 84be2a5176
commit cb84698c2d
2 changed files with 405 additions and 1 deletions

View File

@@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = {
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
# FP audit OGC 17 CRH
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
# Spécialités/services récurrents comme FP NOM
"cancérologie", "cancerologie", "réanimation", "reanimation",
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
@@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = {
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
"planification", "planifié", "planifiee",
# ── FP détectés automatiquement par audit_fp_detector.py ──
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
"vernis", "vessie", "vrac",
# Lot 2 : termes médicaux (préfixes/suffixes)
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
"cétonurie", "cetonurie", "depilation", "dépilation",
"folique", "gastroentérologue", "gastroenterologue",
"microgrammes", "nalidixique", "naso-gastrique",
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
"cyto", "plaie-colle", "bionolyte",
# Lot 1 (103 tokens, confiance >= 0.5) ──
# Anatomie / clinique
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
# Pathologies / symptômes
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
# Médicaments / matériel médical
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
"oxygène", "pansement", "vitamine",
# Biologie / examens
"biochimie", "biologie", "fer",
# Actions / états cliniques
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
"intervention", "position", "rappel", "relation", "retour", "réalisation",
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
"urgent", "validation",
# Mots courants / contextuels
"angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
"clause", "concubin", "confortable", "demain", "densité", "dernière",
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
"personne", "premier", "quartier", "retraite", "route", "rés",
"tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
}
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())

360
audit_fp_detector.py Normal file
View File

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""Détecteur automatique de faux positifs NOM_GLOBAL.
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
1. Le dictionnaire français système (/usr/share/dict/french)
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
Usage:
python3 audit_fp_detector.py /chemin/vers/anonymise/
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
"""
import json
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
# ── French dictionary loading ───────────────────────────────────────
DICT_PATH = Path("/usr/share/dict/french")
# Lazily-populated cache of lowercase dictionary words (length >= 3).
_french_words: set = set()


def _load_french_dict() -> None:
    """Populate the module-level ``_french_words`` set from the system dictionary.

    Idempotent: once the set is non-empty, subsequent calls return immediately.
    Words shorter than 3 characters are skipped (too noisy for FP matching).

    Raises:
        FileNotFoundError: if ``DICT_PATH`` does not exist on this system.
    """
    global _french_words
    if _french_words:
        return  # already loaded
    # Explicit encoding: the previous `open(DICT_PATH)` used the locale
    # default. System word lists may be latin-1; errors="replace" keeps
    # loading instead of crashing on a stray byte.
    with open(DICT_PATH, encoding="utf-8", errors="replace") as f:
        _french_words = {w.strip().lower() for w in f if len(w.strip()) >= 3}
# ── Common French first/last names (dictionary homonyms) ────────────
# These words are BOTH in the dictionary AND real first/last names.
# They must NEVER be flagged as false positives.
# Fix: the original contained the literal "elisabeth" twice; following the
# file's accented/unaccented pairing convention (clement/clément,
# andre/andré), the second occurrence is restored as "élisabeth".
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also dictionary words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    "christine", "caroline", "elisabeth", "élisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Feminine first names ending in -ine (would match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names in -oine/-iste/-ence etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common family names that are also dictionary words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Family names frequent in this corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common hyphenated first names (match medical suffixes -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}
# ── Medical morphology patterns ─────────────────────────────────────
# Suffixes typical of French medical vocabulary (-ite, -ose, -ectomie…).
# NOTE(review): the short nominal suffixes (ine/tion/ment/ence/ance/aire)
# are very broad; analyze_audit_dir suppresses them on short tokens so
# first names like "martine" are not flagged by suffix alone.
_MEDICAL_SUFFIXES = re.compile(
r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
re.IGNORECASE,
)
# Prefixes typical of medical vocabulary (anti-, hyper-, gastro-…).
# Anchored at the start only, so any token beginning with one matches.
_MEDICAL_PREFIXES = re.compile(
r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
r"|pancréato|pancreato|hépato|hepato|splén|splen)",
re.IGNORECASE,
)
# ── Structural / care-record software (DPI) words ──────────────────
# Vocabulary of the electronic patient record software (original comment
# said "trackare" — presumably the InterSystems TrakCare hospital IS;
# TODO confirm). These nominalized action words are recurrent NOM_GLOBAL
# false positives in nursing/medical notes.
_STRUCTURAL_WORDS = {
"observation", "observations", "consultation", "prescripteur",
"administration", "evaluation", "évaluation", "indication",
"surveillance", "transmission", "transmissions",
"preparation", "préparation", "planification",
"validation", "notification", "recommandation",
"intervention", "installation", "mobilisation",
"exploration", "perfusion", "transfusion", "aspiration",
"nutrition", "hydratation", "oxygénation", "oxygenation",
"cicatrisation", "désinfection", "desinfection",
"compensation", "stabilisation", "régularisation",
"hospitalisation", "réhospitalisation",
"amélioration", "amelioration", "dégradation", "degradation",
"aggravation", "complication", "résolution", "resolution",
"progression", "régression", "regression",
"rééducation", "reeducation", "adaptation", "réadaptation",
"orientation", "coordination", "organisation",
}
def _collect_nom_global(audit_files):
    """Gather NOM_GLOBAL hits from ``.audit.jsonl`` files (one JSON object per line).

    Returns a 3-tuple ``(token_files, token_counts, all_kinds)``:
    - token_files: token -> set of source filenames (``.audit`` suffix removed)
    - token_counts: token -> total occurrence count
    - all_kinds: kind -> count over every parsed audit line
    """
    token_files = defaultdict(set)
    token_counts = defaultdict(int)
    all_kinds = defaultdict(int)
    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        with open(af) as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    h = json.loads(raw)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (ValueError, KeyError, TypeError):
                    # Malformed audit line: best-effort skip. Narrowed from
                    # the original bare `except Exception: pass` so genuine
                    # bugs (e.g. I/O errors) are no longer hidden.
                    continue
    return token_files, token_counts, all_kinds


def _score_token(token, n_files):
    """Score one NOM_GLOBAL token as a false-positive candidate.

    Returns ``(confidence, reasons)``: the sum of the triggered heuristic
    weights and the list of heuristic tags that fired.
    """
    reasons = []
    confidence = 0.0
    token_lower = token.lower()
    # 1. Present in the French dictionary?
    in_dict = token_lower in _french_words
    if in_dict:
        reasons.append("DICT_FR")
        confidence += 0.4
    # 2. Medical suffix? Single regex search (the original searched twice).
    m = _MEDICAL_SUFFIXES.search(token_lower)
    has_medical_suffix = m is not None
    if m is not None:
        # Bare "-ine" on short tokens matches first names; generic nominal
        # suffixes are too broad on very short words.
        if m.group() == "ine" and len(token_lower) < 8:
            has_medical_suffix = False
        elif m.group() in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
            has_medical_suffix = False
    if has_medical_suffix:
        reasons.append("SUFFIXE_MED")
        confidence += 0.3
    # 3. Medical prefix?
    if _MEDICAL_PREFIXES.search(token_lower):
        reasons.append("PREFIXE_MED")
        confidence += 0.3
    # 4. Structural care-record (DPI) word?
    if token_lower in _STRUCTURAL_WORDS:
        reasons.append("STRUCT_DPI")
        confidence += 0.5
    # 5. All lowercase (real names are Capitalised or ALL CAPS).
    if token.islower() and len(token) > 3:
        reasons.append("MINUSCULE")
        confidence += 0.2
    # 6. Very short (<= 3 chars) ALL-CAPS -> usually an abbreviation.
    if len(token) <= 3 and token.isupper():
        reasons.append("ABREV_3CH")
        confidence += 0.2
    # 7. Appears in a single file AND is a dictionary word -> very suspect.
    if n_files == 1 and in_dict:
        reasons.append("1_SEUL_DOC")
        confidence += 0.2
    # 8. Hyphenated compound made entirely of dictionary words
    #    (ignoring parts that are known name homonyms).
    if "-" in token:
        parts = token.split("-")
        dict_parts = [p for p in parts
                      if p.lower() in _french_words
                      and p.lower() not in _KNOWN_NAME_HOMOPHONES]
        if dict_parts and len(dict_parts) == len(parts):
            reasons.append("COMPOSE_DICT")
            confidence += 0.3
    return confidence, reasons


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyse every ``.audit.jsonl`` file in *audit_dir*.

    Returns a dict with:
    - "fp_candidates": list of candidate dicts (token, confidence, reasons,
      occurrences, n_files, sample files, already_stopped flag)
    - "stats": summary statistics (empty dict when no audit file is found)
    """
    _load_french_dict()
    audit_path = Path(audit_dir)
    audit_files = sorted(audit_path.glob("*.audit.jsonl"))
    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_path}")
        return {"fp_candidates": [], "stats": {}}
    # ── Collect NOM_GLOBAL per file ─────────────────────────────────
    token_files, token_counts, _all_kinds = _collect_nom_global(audit_files)
    # ── Score each token ────────────────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files):
        # Known first/last-name homonyms are never flagged.
        if token.lower() in _KNOWN_NAME_HOMOPHONES:
            continue
        n_files = len(token_files[token])
        confidence, reasons = _score_token(token, n_files)
        # Threshold: at least one heuristic fired and confidence >= 0.3.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": token_counts[token],
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })
    # Highest confidence first, then alphabetical for stable output.
    fp_candidates.sort(key=lambda c: (-c["confidence"], c["token"]))
    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below when the core is importable
    }
    # ── Mark candidates already covered by the core stop-word set ──
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"])
    except ImportError:
        # Core module unavailable (standalone run): flag as unknown.
        for c in fp_candidates:
            c["already_stopped"] = None
    return {"fp_candidates": fp_candidates, "stats": stats}
def print_report(result: dict):
"""Affiche un rapport lisible."""
stats = result["stats"]
candidates = result["fp_candidates"]
print("=" * 70)
print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
print("=" * 70)
print(f" Fichiers audit analysés : {stats['audit_files']}")
print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
print(f" Candidats FP détectés : {stats['fp_candidates']}")
print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
print()
# Séparer nouveaux vs déjà traités
new_fp = [c for c in candidates if not c.get("already_stopped")]
old_fp = [c for c in candidates if c.get("already_stopped")]
if new_fp:
print(f"{''*70}")
print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
print(f"{''*70}")
print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
print(f" {''*24} {''*5} {''*4} {''*4} {''*30}")
for c in new_fp:
reasons = ", ".join(c["reasons"])
print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")
# Générer le code Python à copier
print(f"\n{''*70}")
print(f" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
print(f"{''*70}")
tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
line = " "
for i, t in enumerate(tokens_to_add):
entry = f'"{t}", '
if len(line) + len(entry) > 95:
print(line.rstrip(", "))
line = " "
line += entry
if line.strip():
print(line.rstrip(", "))
else:
print(" Aucun nouveau FP détecté !")
if old_fp:
print(f"\n{''*70}")
print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
print(f"{''*70}")
for c in old_fp:
print(f"{c['token']}")
print()
def auto_fix(result: dict, core_path: "str | None" = None):
    """Select high-confidence FP tokens to add to the core stop words.

    Keeps candidates that are not already stopped and have confidence >= 0.5,
    prints them, and returns the sorted, deduplicated, lowercased token list
    (empty list when nothing qualifies).

    *core_path* is reserved for a future in-place edit of the core file and
    is currently unused; the annotation is fixed from ``str`` to optional.
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]
    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []
    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")
    return tokens
if __name__ == "__main__":
    # CLI entry point: positional audit directory, optional --auto-fix flag.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)
    target_dir = sys.argv[1]
    wants_fix = "--auto-fix" in sys.argv
    report = analyze_audit_dir(target_dir)
    print_report(report)
    if wants_fix:
        added = auto_fix(report)
        if added:
            print(f"\nTokens à insérer : {added}")