Stop words +170 : détection automatique FP via dictionnaire français (audit_fp_detector.py)
- Nouvel outil audit_fp_detector.py : croise NOM_GLOBAL avec dictionnaire FR (346K mots), patterns morphologiques médicaux, mots structurels DPI, fréquence inter-documents - +170 stop words en 2 lots : termes médicaux (abdomen, bilirubine, gastrique...), soins infirmiers (bijoux, ongles, maquillage, habillage...), mots courants (angle, bureau...) - Ville basque ajoutée : anglet - Résultat : 192/199 FP détectés couverts, 7 restants = artefacts OCR de vrais noms - Total stop words : 5076 tokens Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -385,7 +385,7 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
||||||
# FP audit OGC 17 CRH
|
# FP audit OGC 17 CRH
|
||||||
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
||||||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne",
|
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
|
||||||
# Spécialités/services récurrents comme FP NOM
|
# Spécialités/services récurrents comme FP NOM
|
||||||
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
||||||
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
||||||
@@ -399,6 +399,50 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"transmissions", "transmission", "releve", "relevé",
|
"transmissions", "transmission", "releve", "relevé",
|
||||||
"objectif", "objectifs", "evaluation", "évaluation",
|
"objectif", "objectifs", "evaluation", "évaluation",
|
||||||
"planification", "planifié", "planifiee",
|
"planification", "planifié", "planifiee",
|
||||||
|
# ── FP détectés automatiquement par audit_fp_detector.py ──
|
||||||
|
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
|
||||||
|
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
|
||||||
|
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
|
||||||
|
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
|
||||||
|
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
|
||||||
|
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
|
||||||
|
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
|
||||||
|
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
|
||||||
|
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
|
||||||
|
"vernis", "vessie", "vrac",
|
||||||
|
# Lot 2 : termes médicaux (préfixes/suffixes)
|
||||||
|
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
|
||||||
|
"cétonurie", "cetonurie", "depilation", "dépilation",
|
||||||
|
"folique", "gastroentérologue", "gastroenterologue",
|
||||||
|
"microgrammes", "nalidixique", "naso-gastrique",
|
||||||
|
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
|
||||||
|
"cyto", "plaie-colle", "bionolyte",
|
||||||
|
# Lot 1 (103 tokens, confiance >= 0.5) ──
|
||||||
|
# Anatomie / clinique
|
||||||
|
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
|
||||||
|
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
|
||||||
|
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
|
||||||
|
# Pathologies / symptômes
|
||||||
|
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
|
||||||
|
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
|
||||||
|
# Médicaments / matériel médical
|
||||||
|
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
|
||||||
|
"oxygène", "pansement", "vitamine",
|
||||||
|
# Biologie / examens
|
||||||
|
"biochimie", "biologie", "fer",
|
||||||
|
# Actions / états cliniques
|
||||||
|
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
|
||||||
|
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
|
||||||
|
"intervention", "position", "rappel", "relation", "retour", "réalisation",
|
||||||
|
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
|
||||||
|
"urgent", "validation",
|
||||||
|
# Mots courants / contextuels
|
||||||
|
"angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
|
||||||
|
"clause", "concubin", "confortable", "demain", "densité", "dernière",
|
||||||
|
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
|
||||||
|
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
|
||||||
|
"personne", "premier", "quartier", "retraite", "route", "rés",
|
||||||
|
"tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
|
||||||
}
|
}
|
||||||
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
||||||
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
||||||
|
|||||||
360
audit_fp_detector.py
Normal file
360
audit_fp_detector.py
Normal file
@@ -0,0 +1,360 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Détecteur automatique de faux positifs NOM_GLOBAL.
|
||||||
|
|
||||||
|
Analyse les fichiers .audit.jsonl et croise les NOM_GLOBAL avec :
|
||||||
|
1. Le dictionnaire français système (/usr/share/dict/french)
|
||||||
|
2. Des patterns morphologiques médicaux (-ite, -ose, -ique, -ine, etc.)
|
||||||
|
3. La fréquence inter-documents (un vrai nom apparaît rarement dans 1 seul dossier)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 audit_fp_detector.py /chemin/vers/anonymise/
|
||||||
|
python3 audit_fp_detector.py /chemin/vers/anonymise/ --auto-fix
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ── Chargement dictionnaire français ────────────────────────────────
|
||||||
|
DICT_PATH = Path("/usr/share/dict/french")
|
||||||
|
_french_words: set = set()
|
||||||
|
|
||||||
|
def _load_french_dict():
|
||||||
|
global _french_words
|
||||||
|
if _french_words:
|
||||||
|
return
|
||||||
|
with open(DICT_PATH) as f:
|
||||||
|
_french_words = set(w.strip().lower() for w in f if len(w.strip()) >= 3)
|
||||||
|
|
||||||
|
# ── Prénoms/noms français courants (homonymes dictionnaire) ────────
|
||||||
|
# Ces mots sont à la fois dans le dictionnaire ET sont des prénoms/noms réels.
|
||||||
|
# Ils ne doivent PAS être flagués comme FP.
|
||||||
|
_KNOWN_NAME_HOMOPHONES = {
|
||||||
|
# Prénoms courants qui sont aussi des mots
|
||||||
|
"martin", "bernard", "petit", "richard", "moreau", "laurent",
|
||||||
|
"simon", "pierre", "marie", "jean", "paul", "louis", "marc",
|
||||||
|
"charles", "henry", "victor", "rose", "marguerite", "pascal",
|
||||||
|
"leon", "léon", "auguste", "clement", "clément", "olive",
|
||||||
|
"sylvie", "denis", "raymond", "roger", "maxime", "claude",
|
||||||
|
"marcel", "germaine", "alice", "florence", "dominique",
|
||||||
|
"christine", "caroline", "elisabeth", "elisabeth", "thomas",
|
||||||
|
"nicolas", "vincent", "benjamin", "lucien", "gaston",
|
||||||
|
"annette", "colette", "suzanne", "andre", "andré", "rené",
|
||||||
|
"yves", "gilles", "noel", "noël", "aimé", "aime",
|
||||||
|
"guy", "joël", "joelle", "gilbert", "fernand", "édith",
|
||||||
|
"edith", "agnès", "agnes", "jeanne", "lucie", "laure",
|
||||||
|
"adrien", "bastien", "julien", "viviane", "constance",
|
||||||
|
"armand", "blanche", "clémence", "clemence", "prudence", "patience",
|
||||||
|
"grace", "grâce", "fidèle", "placide",
|
||||||
|
# Prénoms féminins en -ine/-ine (matchent le suffixe médical)
|
||||||
|
"adeline", "aline", "amandine", "capucine", "celine", "céline",
|
||||||
|
"coline", "catherine", "clementine", "clémentine", "delphine",
|
||||||
|
"emeline", "émeline", "frédérique", "frederique", "ghislaine",
|
||||||
|
"gwendoline", "justine", "karine", "laurence", "laurie",
|
||||||
|
"marceline", "marine", "marjolaine", "martine", "madeleine",
|
||||||
|
"melaine", "moline", "morgane", "nadine", "noémie", "noemie",
|
||||||
|
"pauline", "perrine", "sabine", "sandrine", "séverine", "severine",
|
||||||
|
"tiphaine", "virginie",
|
||||||
|
# Prénoms en -oine/-iste/-ence etc.
|
||||||
|
"antoine", "baptiste", "patrice", "romain", "charlotte",
|
||||||
|
"alexandra", "aurore", "jules", "jacques", "mathieu",
|
||||||
|
"olivier", "ana", "maria", "pascale", "laura", "margot",
|
||||||
|
"marina", "maite", "maïté",
|
||||||
|
# Noms de famille courants qui sont aussi des mots
|
||||||
|
"blanc", "noir", "fort", "brun", "roux", "roy", "fabre",
|
||||||
|
"page", "comte", "baron", "marin", "fournier", "bonhomme",
|
||||||
|
"boucher", "berger", "marchand", "chevalier", "gros",
|
||||||
|
"masson", "bonnet", "vidal", "meunier", "fontaine",
|
||||||
|
"robin", "gay", "perrin", "roche", "rey",
|
||||||
|
"maître", "maitre", "salle", "aubry",
|
||||||
|
# Noms de famille fréquents dans le corpus
|
||||||
|
"barbotin", "brocard", "brunet", "cailliez", "charrier",
|
||||||
|
"colas", "combes", "forges", "gaillard", "galidie",
|
||||||
|
"gendre", "genevois", "grenier", "lemoine", "martel",
|
||||||
|
"martial", "moulin", "pineau", "piper", "pontier",
|
||||||
|
"poulot", "rosier", "roussel-fontaine", "saule", "senne",
|
||||||
|
"serrate", "serre", "taris", "vigneau", "vignes", "voisin",
|
||||||
|
"barre", "campagnard", "claire", "capera", "bigourdan",
|
||||||
|
"breton", "rainer", "bataille", "cabaner", "klement",
|
||||||
|
"boucau", "marte", "dores", "culot",
|
||||||
|
# Prénoms composés courants (matchent suffixe médical -ine/-ence)
|
||||||
|
"anne-christine", "marie-christine", "marie-line",
|
||||||
|
"marie-marceline", "berges",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Medical morphology patterns ─────────────────────────────────────
# A token whose lower-cased form ends with one of these suffixes
# (…-ite, -ose, -ectomie, -pathie, …) looks like medical vocabulary
# rather than a person name. Anchored at the end only ($) and used with
# .search() by the caller; both accented and unaccented spellings are
# listed (émie/emie, pnée/pnee, …) because tokens may lack accents.
# NOTE: short generic suffixes here ("ine", "tion", "ment", …) are
# deliberately re-filtered by length in analyze_audit_dir() to avoid
# matching first names such as "martine".
_MEDICAL_SUFFIXES = re.compile(
    r"(?:ite|ose|ique|isme|ine|ome|able|tion|ment|aire|ence|ance"
    r"|ectomie|tomie|pathie|scopie|plasie|trophie|graphie"
    r"|lyse|émie|emie|urie|pnée|pnee|algie|cyte|gène|gene"
    r"|oïde|oide|ïque|phage|logie|thérapie|therapie)$",
    re.IGNORECASE,
)

# A token starting with one of these prefixes (anti-, hyper-, gastro-,
# cardio-, …) is likewise treated as a medical term. Anchored at the
# start (^); accented and unaccented variants are both listed.
_MEDICAL_PREFIXES = re.compile(
    r"^(?:anti|hyper|hypo|intra|extra|para|péri|peri|poly|mono"
    r"|néo|neo|dys|hémo|hemo|héma|hema|gastro|entéro|entero"
    r"|broncho|pneumo|cardio|neuro|uro|néphro|nephro"
    r"|ostéo|osteo|arthro|dermato|onco|immuno|cyto|histo"
    r"|bio|micro|macro|angio|adéno|adeno|cholé|chole"
    r"|pancréato|pancreato|hépato|hepato|splén|splen)",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
# ── Structural words from DPI documents ─────────────────────────────
# Vocabulary coming from the structure of hospital EHR (DPI) documents:
# section headings, nursing actions, care-plan labels. When tagged as
# NOM_GLOBAL these are almost certainly false positives. Membership is
# tested on the lower-cased token; accented and unaccented variants are
# both listed.
# NOTE(review): the original comment said "trackare" — presumably the
# TrakCare DPI software; confirm with the document source.
_STRUCTURAL_WORDS = {
    "observation", "observations", "consultation", "prescripteur",
    "administration", "evaluation", "évaluation", "indication",
    "surveillance", "transmission", "transmissions",
    "preparation", "préparation", "planification",
    "validation", "notification", "recommandation",
    "intervention", "installation", "mobilisation",
    "exploration", "perfusion", "transfusion", "aspiration",
    "nutrition", "hydratation", "oxygénation", "oxygenation",
    "cicatrisation", "désinfection", "desinfection",
    "compensation", "stabilisation", "régularisation",
    "hospitalisation", "réhospitalisation",
    "amélioration", "amelioration", "dégradation", "degradation",
    "aggravation", "complication", "résolution", "resolution",
    "progression", "régression", "regression",
    "rééducation", "reeducation", "adaptation", "réadaptation",
    "orientation", "coordination", "organisation",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_nom_global(audit_files):
    """Scan .audit.jsonl files and tally NOM_GLOBAL hits.

    Returns (token_files, token_counts, all_kinds) where token_files maps
    token → set of source filenames, token_counts maps token → total
    occurrences, and all_kinds counts every hit kind seen.
    """
    token_files = defaultdict(set)
    token_counts = defaultdict(int)
    all_kinds = defaultdict(int)

    for af in audit_files:
        fname = af.stem.replace(".audit", "")
        # Audit files are JSONL; assumed UTF-8 — TODO confirm writer side.
        with open(af, encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    hit = json.loads(raw)
                    all_kinds[hit["kind"]] += 1
                    if hit["kind"] == "NOM_GLOBAL":
                        token = hit["original"]
                        token_files[token].add(fname)
                        token_counts[token] += 1
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best-effort parsing: skip malformed or incomplete
                    # audit lines instead of aborting the whole run.
                    # (Narrowed from a bare `except Exception: pass`.)
                    continue

    return token_files, token_counts, all_kinds


def _score_token(token, n_files):
    """Score one NOM_GLOBAL token against the FP heuristics.

    Args:
        token: the original (case-preserved) token.
        n_files: number of distinct documents the token appears in.

    Returns:
        (reasons, confidence) — list of heuristic tags and the summed
        confidence. An empty reasons list means "looks like a real name".
    """
    reasons = []
    confidence = 0.0
    token_lower = token.lower()

    # 1. Present in the French dictionary?
    in_dict = token_lower in _french_words
    if in_dict:
        reasons.append("DICT_FR")
        confidence += 0.4

    # 2. Medical suffix? (search once; the original searched twice)
    suffix_match = _MEDICAL_SUFFIXES.search(token_lower)
    has_medical_suffix = bool(suffix_match)
    if suffix_match:
        # Bare "-ine" is too broad for short tokens (first names).
        if suffix_match.group() == "ine" and len(token_lower) < 8:
            has_medical_suffix = False
        # Generic noun suffixes are too broad for very short tokens.
        elif suffix_match.group() in ("tion", "ment", "ence", "ance", "aire") and len(token_lower) < 6:
            has_medical_suffix = False
    if has_medical_suffix:
        reasons.append("SUFFIXE_MED")
        confidence += 0.3

    # 3. Medical prefix?
    if _MEDICAL_PREFIXES.search(token_lower):
        reasons.append("PREFIXE_MED")
        confidence += 0.3

    # 4. Structural DPI word?
    if token_lower in _STRUCTURAL_WORDS:
        reasons.append("STRUCT_DPI")
        confidence += 0.5

    # 5. All lowercase (real names are Capitalized or ALL CAPS).
    if token.islower() and len(token) > 3:
        reasons.append("MINUSCULE")
        confidence += 0.2

    # 6. Very short (<=3) and ALL CAPS → usually an abbreviation.
    if len(token) <= 3 and token.isupper():
        reasons.append("ABREV_3CH")
        confidence += 0.2

    # 7. Appears in a single document AND is a dictionary word → suspect.
    if n_files == 1 and in_dict:
        reasons.append("1_SEUL_DOC")
        confidence += 0.2

    # 8. Hyphenated compound made entirely of dictionary words
    #    (excluding known name homophones).
    if "-" in token:
        parts = token.split("-")
        dict_parts = [
            p for p in parts
            if p.lower() in _french_words
            and p.lower() not in _KNOWN_NAME_HOMOPHONES
        ]
        if dict_parts and len(dict_parts) == len(parts):
            reasons.append("COMPOSE_DICT")
            confidence += 0.3

    return reasons, confidence


def analyze_audit_dir(audit_dir: str) -> dict:
    """Analyze every .audit.jsonl file of a directory for NOM_GLOBAL FPs.

    Args:
        audit_dir: directory containing ``*.audit.jsonl`` files.

    Returns:
        dict with:
        - "fp_candidates": list of candidate dicts (token, confidence,
          reasons, occurrences, n_files, files, already_stopped);
        - "stats": summary statistics (empty dict if no audit files).
    """
    _load_french_dict()

    audit_dir = Path(audit_dir)
    audit_files = sorted(audit_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"Aucun fichier .audit.jsonl trouvé dans {audit_dir}")
        return {"fp_candidates": [], "stats": {}}

    # ── Collect NOM_GLOBAL hits per file ────────────────────────────
    token_files, token_counts, _all_kinds = _collect_nom_global(audit_files)

    # ── Score each token ────────────────────────────────────────────
    fp_candidates = []
    for token in sorted(token_files):
        # Known first/last-name homophones are never flagged.
        if token.lower() in _KNOWN_NAME_HOMOPHONES:
            continue

        n_files = len(token_files[token])
        reasons, confidence = _score_token(token, n_files)

        # Threshold: at least one reason and a minimal confidence.
        if reasons and confidence >= 0.3:
            fp_candidates.append({
                "token": token,
                "confidence": round(min(confidence, 1.0), 2),
                "reasons": reasons,
                "occurrences": token_counts[token],
                "n_files": n_files,
                "files": sorted(token_files[token])[:3],
            })

    # Sort by decreasing confidence, then token for determinism.
    fp_candidates.sort(key=lambda x: (-x["confidence"], x["token"]))

    stats = {
        "audit_files": len(audit_files),
        "total_nom_global_unique": len(token_files),
        "total_nom_global_occurrences": sum(token_counts.values()),
        "fp_candidates": len(fp_candidates),
        "already_in_stopwords": 0,  # filled below
    }

    # Mark candidates that are already in the core stop words.
    try:
        sys.path.insert(0, str(Path(__file__).parent))
        from anonymizer_core_refactored_onnx import _MEDICAL_STOP_WORDS_SET
        for c in fp_candidates:
            c["already_stopped"] = c["token"].lower() in _MEDICAL_STOP_WORDS_SET
        stats["already_in_stopwords"] = sum(
            1 for c in fp_candidates if c["already_stopped"]
        )
    except ImportError:
        # Core module unavailable: leave the flag undetermined (None).
        for c in fp_candidates:
            c["already_stopped"] = None

    return {"fp_candidates": fp_candidates, "stats": stats}
|
||||||
|
|
||||||
|
|
||||||
|
def print_report(result: dict):
    """Print a human-readable report of the FP analysis.

    Expects the dict returned by analyze_audit_dir(). Output includes a
    summary header, the new FP candidates as an aligned table, a ready-
    to-paste Python snippet for _MEDICAL_STOP_WORDS_SET, and the list of
    candidates already covered by the stop words.
    """
    stats = result["stats"]
    candidates = result["fp_candidates"]

    print("=" * 70)
    print(" DÉTECTION AUTOMATIQUE FAUX POSITIFS NOM_GLOBAL")
    print("=" * 70)
    print(f" Fichiers audit analysés : {stats['audit_files']}")
    print(f" NOM_GLOBAL uniques : {stats['total_nom_global_unique']}")
    print(f" NOM_GLOBAL occurrences : {stats['total_nom_global_occurrences']}")
    print(f" Candidats FP détectés : {stats['fp_candidates']}")
    print(f" Déjà dans stop words : {stats['already_in_stopwords']}")
    print()

    # Split new candidates from those already in the stop words.
    # .get() so that already_stopped == None (core not importable)
    # counts as "new".
    new_fp = [c for c in candidates if not c.get("already_stopped")]
    old_fp = [c for c in candidates if c.get("already_stopped")]

    if new_fp:
        print(f"{'─'*70}")
        print(f" NOUVEAUX FP À AJOUTER AUX STOP WORDS ({len(new_fp)})")
        print(f"{'─'*70}")
        print(f" {'Token':<25s} {'Conf':>5s} {'Occ':>4s} {'Docs':>4s} Raisons")
        print(f" {'─'*24} {'─'*5} {'─'*4} {'─'*4} {'─'*30}")
        for c in new_fp:
            reasons = ", ".join(c["reasons"])
            print(f" {c['token']:<25s} {c['confidence']:>5.2f} {c['occurrences']:>4d} {c['n_files']:>4d} {reasons}")

        # Emit a copy-pastable Python snippet for the stop-words set.
        print(f"\n{'─'*70}")
        print(f" CODE À AJOUTER dans _MEDICAL_STOP_WORDS_SET :")
        print(f"{'─'*70}")
        tokens_to_add = sorted(set(c["token"].lower() for c in new_fp))
        line = "    "
        for i, t in enumerate(tokens_to_add):
            entry = f'"{t}", '
            # Wrap lines at ~95 chars; rstrip(", ") removes the trailing
            # separator characters (char-set strip, intentional here).
            if len(line) + len(entry) > 95:
                print(line.rstrip(", "))
                line = "    "
            line += entry
        if line.strip():
            print(line.rstrip(", "))
    else:
        print(" Aucun nouveau FP détecté !")

    if old_fp:
        print(f"\n{'─'*70}")
        print(f" DÉJÀ DANS STOP WORDS ({len(old_fp)}) — OK")
        print(f"{'─'*70}")
        for c in old_fp:
            print(f" ✓ {c['token']}")

    print()
|
||||||
|
|
||||||
|
|
||||||
|
def auto_fix(result: dict, core_path=None):
    """Select high-confidence FP tokens to add to the core stop words.

    Despite the name, nothing is written anywhere yet: the function
    prints and returns the candidate tokens so the caller (or a human)
    can insert them into _MEDICAL_STOP_WORDS_SET.

    Args:
        result: the dict returned by analyze_audit_dir().
        core_path: reserved for a future in-place edit of the core
            module; currently unused. (Annotation fixed: the default is
            None, so it was mistyped as plain ``str``.)

    Returns:
        Sorted list of unique lower-cased tokens with confidence >= 0.5
        that are not already stop words; empty list if none qualify.
    """
    new_fp = [c for c in result["fp_candidates"]
              if not c.get("already_stopped") and c["confidence"] >= 0.5]

    if not new_fp:
        print("Aucun FP à haute confiance à ajouter automatiquement.")
        return []

    # Set comprehension: deduplicate case variants of the same token.
    tokens = sorted({c["token"].lower() for c in new_fp})
    print(f"\n{len(tokens)} tokens à ajouter automatiquement (confiance >= 0.5):")
    for t in tokens:
        print(f" + {t}")

    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: analyze an audit directory, print the report,
    and optionally list the tokens to auto-add (--auto-fix)."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <audit_dir> [--auto-fix]")
        sys.exit(1)

    target_dir = sys.argv[1]
    wants_fix = "--auto-fix" in sys.argv

    report = analyze_audit_dir(target_dir)
    print_report(report)

    if wants_fix:
        added = auto_fix(report)
        if added:
            print(f"\nTokens à insérer : {added}")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user