Stop words +170 : détection automatique FP via dictionnaire français (audit_fp_detector.py)
- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots), patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents - +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...), soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...) - Ville basque ajoutée : anglet - Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms - Total stop words : 5076 tokens Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = {
|
||||
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
||||
# FP audit OGC 17 CRH
|
||||
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
||||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne",
|
||||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
|
||||
# Spécialités/services récurrents comme FP NOM
|
||||
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
||||
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
||||
@@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = {
|
||||
"transmissions", "transmission", "releve", "relevé",
|
||||
"objectif", "objectifs", "evaluation", "évaluation",
|
||||
"planification", "planifié", "planifiee",
|
||||
# ── FP détectés automatiquement par audit_fp_detector.py ──
|
||||
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
|
||||
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
|
||||
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
|
||||
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
|
||||
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
|
||||
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
|
||||
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
|
||||
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
|
||||
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
|
||||
"vernis", "vessie", "vrac",
|
||||
# Lot 2 : termes médicaux (préfixes/suffixes)
|
||||
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
|
||||
"cétonurie", "cetonurie", "depilation", "dépilation",
|
||||
"folique", "gastroentérologue", "gastroenterologue",
|
||||
"microgrammes", "nalidixique", "naso-gastrique",
|
||||
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
|
||||
"cyto", "plaie-colle", "bionolyte",
|
||||
# Lot 1 (103 tokens, confiance >= 0.5) ──
|
||||
# Anatomie / clinique
|
||||
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
|
||||
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
|
||||
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
|
||||
# Pathologies / symptômes
|
||||
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
|
||||
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
|
||||
# Médicaments / matériel médical
|
||||
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
|
||||
"oxygène", "pansement", "vitamine",
|
||||
# Biologie / examens
|
||||
"biochimie", "biologie", "fer",
|
||||
# Actions / états cliniques
|
||||
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
|
||||
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
|
||||
"intervention", "position", "rappel", "relation", "retour", "réalisation",
|
||||
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
|
||||
"urgent", "validation",
|
||||
# Mots courants / contextuels
|
||||
"angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
|
||||
"clause", "concubin", "confortable", "demain", "densité", "dernière",
|
||||
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
|
||||
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
|
||||
"personne", "premier", "quartier", "retraite", "route", "rés",
|
||||
"tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
|
||||
}
|
||||
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
||||
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
||||
|
||||
360
audit_fp_detector.py
Normal file
360
audit_fp_detector.py
Normal file
@@ -0,0 +1,360 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Détecteur automatique de faux positifs NOM_GLOBAL.
|
||||
|
||||
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
|
||||
1. Le dictionnaire français système (/usr/share/dict/french)
|
||||
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
|
||||
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
|
||||
|
||||
Usage:
|
||||
python3 audit_fp_detector.py /chemin/vers/anonymise/
|
||||
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
# ── French dictionary loading ───────────────────────────────────────
# System word list used as the primary "this token is a common word" signal.
DICT_PATH = Path("/usr/share/dict/french")
_french_words: "set[str]" = set()


def _load_french_dict():
    """Load the system French dictionary into ``_french_words`` (lazy, once).

    Keeps only entries of 3+ characters, lowercased. Subsequent calls are
    no-ops once the set is populated. Raises FileNotFoundError when
    DICT_PATH is absent — the detector cannot work without it.
    """
    global _french_words
    if _french_words:
        return  # already loaded
    # FIX: read as UTF-8 explicitly (word list contains accented entries)
    # and strip each line once (walrus) instead of twice as before.
    with open(DICT_PATH, encoding="utf-8") as f:
        _french_words = {s.lower() for w in f if len(s := w.strip()) >= 3}
|
||||
|
||||
# ── Common French first/last names that are also dictionary words ───
# These words are BOTH in the dictionary AND real first/last names.
# They must NOT be flagged as false positives.
_KNOWN_NAME_HOMOPHONES = {
    # Common first names that are also words
    "martin", "bernard", "petit", "richard", "moreau", "laurent",
    "simon", "pierre", "marie", "jean", "paul", "louis", "marc",
    "charles", "henry", "victor", "rose", "marguerite", "pascal",
    "leon", "léon", "auguste", "clement", "clément", "olive",
    "sylvie", "denis", "raymond", "roger", "maxime", "claude",
    "marcel", "germaine", "alice", "florence", "dominique",
    # FIX: the second "elisabeth" was a duplicated literal; the accented
    # variant was clearly intended (matches the leon/léon, clement/clément
    # accent-pair convention used throughout this set).
    "christine", "caroline", "elisabeth", "élisabeth", "thomas",
    "nicolas", "vincent", "benjamin", "lucien", "gaston",
    "annette", "colette", "suzanne", "andre", "andré", "rené",
    "yves", "gilles", "noel", "noël", "aimé", "aime",
    "guy", "joël", "joelle", "gilbert", "fernand", "édith",
    "edith", "agnès", "agnes", "jeanne", "lucie", "laure",
    "adrien", "bastien", "julien", "viviane", "constance",
    "armand", "blanche", "clémence", "clemence", "prudence", "patience",
    "grace", "grâce", "fidèle", "placide",
    # Feminine first names in -ine (they match the medical suffix)
    "adeline", "aline", "amandine", "capucine", "celine", "céline",
    "coline", "catherine", "clementine", "clémentine", "delphine",
    "emeline", "émeline", "frédérique", "frederique", "ghislaine",
    "gwendoline", "justine", "karine", "laurence", "laurie",
    "marceline", "marine", "marjolaine", "martine", "madeleine",
    "melaine", "moline", "morgane", "nadine", "noémie", "noemie",
    "pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
    "tiphaine", "virginie",
    # First names in -oine/-iste/-ence etc.
    "antoine", "baptiste", "patrice", "romain", "charlotte",
    "alexandra", "aurore", "jules", "jacques", "mathieu",
    "olivier", "ana", "maria", "pascale", "laura", "margot",
    "marina", "maite", "maïté",
    # Common last names that are also words
    "blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
    "page", "comte", "baron", "marin", "fournier", "bonhomme",
    "boucher", "berger", "marchand", "chevalier", "gros",
    "masson", "bonnet", "vidal", "meunier", "fontaine",
    "robin", "gay", "perrin", "roche", "rey",
    "maître", "maitre", "salle", "aubry",
    # Last names frequent in the corpus
    "barbotin", "brocard", "brunet", "cailliez", "charrier",
    "colas", "combes", "forges", "gaillard", "galidie",
    "gendre", "genevois", "grenier", "lemoine", "martel",
    "martial", "moulin", "pineau", "piper", "pontier",
    "poulot", "rosier", "roussel-fontaine", "saule", "senne",
    "serrate", "serre", "taris", "vigneau", "vignes", "voisin",
    "barre", "campagnard", "claire", "capera", "bigourdan",
    "breton", "rainer", "bataille", "cabaner", "klement",
    "boucau", "marte", "dores", "culot",
    # Common compound first names (they match medical suffix -ine/-ence)
    "anne-christine", "marie-christine", "marie-line",
    "marie-marceline", "berges",
}
|
||||
|
||||
# ── Medical morphology patterns ─────────────────────────────────────
# Suffixes typical of medical vocabulary (-ite, -ose, -ectomie, ...),
# anchored at the end of the token.
# NOTE(review): downstream code (analyze_audit_dir) inspects match.group()
# for specific values ("ine", "tion", "ment", "ence", "ance", "aire") to
# exempt short tokens — be careful when reordering or renaming alternatives.
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)

# Prefixes typical of medical vocabulary (anti-, hyper-, gastro-, ...),
# anchored at the start of the token only; the rest of the word is
# unconstrained (no trailing anchor), so a prefix hit alone suffices.
# Accented and unaccented spellings are both listed (OCR/typing variants).
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)
|
||||
|
||||
# ── Structural words from the DPI / care-record software ────────────
# Section headers and care-workflow nouns (mostly -tion nominalisations)
# that the NER sometimes tags as a surname. Listed with and without
# accents to cover OCR and typing variants. Membership-tested only.
_STRUCTURAL_WORDS = set(
    """
    observation observations consultation prescripteur
    administration evaluation évaluation indication
    surveillance transmission transmissions
    preparation préparation planification
    validation notification recommandation
    intervention installation mobilisation
    exploration perfusion transfusion aspiration
    nutrition hydratation oxygénation oxygenation
    cicatrisation désinfection desinfection
    compensation stabilisation régularisation
    hospitalisation réhospitalisation
    amélioration amelioration dégradation degradation
    aggravation complication résolution resolution
    progression régression regression
    rééducation reeducation adaptation réadaptation
    orientation coordination organisation
    """.split()
)
|
||||
|
||||
|
||||
def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyse every ``*.audit.jsonl`` file in *audit_dir* for NOM_GLOBAL FPs.

    Each unique NOM_GLOBAL token is scored against independent signals
    (French dictionary, medical morphology, DPI structural words, casing,
    cross-document frequency); the signals add up to a confidence capped
    at 1.0, and tokens scoring >= 0.3 become candidates.

    Returns a dict with:
      - "fp_candidates": list of dicts (token, confidence, reasons,
        occurrences, n_files, files, already_stopped)
      - "stats": summary statistics
    """
    _load_french_dict()

    audit_dir = Path(audit_dir)
    audit_files = sorted(audit_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL hits per file ────────────────────────────
    token_files = defaultdict(set)  # token → set of filenames
    token_counts = defaultdict(int)  # token → total occurrences
    all_kinds = defaultdict(int)  # kind → count (kept for debugging)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        with open(af, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    h = json.loads(line)
                    all_kinds[h["kind"]] += 1
                    if h["kind"] == "NOM_GLOBAL":
                        token = h["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                # FIX: was a blanket `except Exception: pass`. Keep the
                # best-effort behaviour for the *expected* failure modes
                # (malformed JSON line, missing key, non-dict entry) but
                # let genuine bugs surface instead of being swallowed.
                except (json.JSONDecodeError, KeyError, TypeError):
                    continue

    # ── Score each token ────────────────────────────────────────────
    fp_candidates = []

    for token in sorted(token_files.keys()):
        reasons = []
        confidence = 0.0
        token_lower = token.lower()
        n_files = len(token_files[token])
        n_total = token_counts[token]

        # Known first/last names that are also dictionary words: never FP.
        if token_lower in _KNOWN_NAME_HOMOPHONES:
            continue

        # 1. Present in the French dictionary?
        in_dict = token_lower in _french_words
        if in_dict:
            reasons.append("DICT_FR")
            confidence += 0.4

        # 2. Medical suffix?  (FIX: run the regex once, not twice)
        suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
        has_medical_suffix = suffix_match is not None
        if suffix_match is not None:
            suffix = suffix_match.group()
            # Bare "-ine" is too broad for short tokens (first names).
            if suffix == "ine" and len(token_lower) < 8:
                has_medical_suffix = False
            # Generic nominalisation suffixes on very short words are noise.
            elif suffix in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
                has_medical_suffix = False
        if has_medical_suffix:
            reasons.append("SUFFIXE_MED")
            confidence += 0.3

        # 3. Medical prefix?
        if _MEDICAL_PREFIXES.search(token_lower):
            reasons.append("PREFIXE_MED")
            confidence += 0.3

        # 4. DPI structural word?
        if token_lower in _STRUCTURAL_WORDS:
            reasons.append("STRUCT_DPI")
            confidence += 0.5

        # 5. All lowercase (real names are Capitalised or ALL CAPS).
        if token.islower() and len(token) > 3:
            reasons.append("MINUSCULE")
            confidence += 0.2

        # 6. Very short (<= 3 chars) and ALL-CAPS → usually an abbreviation.
        if len(token) <= 3 and token.isupper():
            reasons.append("ABREV_3CH")
            confidence += 0.2

        # 7. Seen in a single document AND in the dictionary → very suspect.
        if n_files == 1 and in_dict:
            reasons.append("1_SEUL_DOC")
            confidence += 0.2

        # 8. Hyphenated compound made exclusively of dictionary words.
        if "-" in token:
            parts = token.split("-")
            dict_parts = [p for p in parts if p.lower() in _french_words and p.lower() not in _KNOWN_NAME_HOMOPHONES]
            if dict_parts and len(dict_parts) == len(parts):
                reasons.append("COMPOSE_DICT")
                confidence += 0.3

        # Threshold: at least one reason AND a minimum cumulative score.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": n_total,
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Sort by decreasing confidence, then token for deterministic output.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below
    }

    # Flag which candidates are already covered by the core stop words.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        # FIX: single pass (the original built a list comprehension and
        # then re-tested membership in a second loop).
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(1 for c in fp_candidates if c["already_stopped"])
    except ImportError:
        # Core module unavailable (standalone run): leave the flag unknown.
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}
|
||||
|
||||
|
||||
def print_report(result: dict):
    """Print a human-readable report of an ``analyze_audit_dir`` result.

    Shows summary stats, the table of new FP candidates, ready-to-paste
    Python code for `_MEDICAL_STOP_WORDS_SET`, and the list of candidates
    already covered. Returns None; output goes to stdout only.
    """
    stats = result["stats"]
    candidates = result["fp_candidates"]

    print("=" * 70)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print("=" * 70)
    print(f" Fichiers audit analysés : {stats['audit_files']}")
    print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {stats['fp_candidates']}")
    print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
    print()

    # Split new candidates from those already covered by the stop words.
    # already_stopped may be None (core module unavailable): treated as new.
    new_fp = [c for c in candidates if not c.get("already_stopped")]
    old_fp = [c for c in candidates if c.get("already_stopped")]

    if new_fp:
        print("─" * 70)
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print("─" * 70)
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            reasons = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")

        # Emit ready-to-paste Python, wrapped at ~95 columns.
        # FIX: dropped placeholder-free f-strings and the unused
        # enumerate() index from the original.
        print(f"\n{'─'*70}")
        print(" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print("─" * 70)
        tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
        line = " "
        for t in tokens_to_add:
            entry = f'"{t}", '
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = " "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{'─'*70}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print("─" * 70)
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()
|
||||
|
||||
|
||||
def auto_fix(result: dict, core_path: "str | None" = None):
    """Print and return the high-confidence FP tokens to add to the core.

    Keeps only candidates that are not already stop words and whose
    confidence is >= 0.5, lowercased, deduplicated and sorted.
    FIX: annotation was ``core_path: str = None`` (wrong for a None
    default). The parameter is accepted for API compatibility but is
    currently unused — the caller pastes the returned tokens manually.

    Returns the list of tokens (empty when nothing qualifies).
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    tokens = sorted(set(c["token"].lower() for c in new_fp))
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: analyse a directory of .audit.jsonl files and
    # optionally list the tokens to add to the stop words.
    cli_args = sys.argv[1:]
    if not cli_args:
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)

    target_dir = cli_args[0]
    wants_fix = "--auto-fix" in sys.argv

    report = analyze_audit_dir(target_dir)
    print_report(report)

    if wants_fix and (added := auto_fix(report)):
        print(f"\nTokens à insérer : {added}")
|
||||
Reference in New Issue
Block a user