feat: Analyse propagation globale - 100% des *_GLOBAL et NOM_EXTRACTED sont des FP

This commit is contained in:
2026-03-02 11:01:14 +01:00
parent 4eba826ca5
commit dfa45041d7
6 changed files with 876 additions and 3 deletions

View File

@@ -26,9 +26,9 @@
- [x] 1.1.3.4 Valider les annotations (double vérification)
- [x] 1.1.3.5 Calculer les statistiques du dataset (PII par type, difficulté)
- [ ] 1.1.4 Enrichir la liste des stopwords médicaux
- [ ] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés
- [ ] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort)
- [-] 1.1.4 Enrichir la liste des stopwords médicaux
- [x] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés
- [x] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort)
- [ ] 1.1.4.3 Ajouter les nouveaux termes à `_MEDICAL_STOP_WORDS_SET`
- [ ] 1.1.4.4 Documenter les sources des stopwords

View File

@@ -0,0 +1,164 @@
{
"analysis_date": "2026-03-02",
"global_types_stats": [
{
"type": "NOM_GLOBAL",
"base_type": "NOM",
"tp": 0,
"fp": 670,
"fn": 0,
"precision": 0.0,
"base_tp": 506,
"base_fp": 0,
"base_precision": 1.0
},
{
"type": "TEL_GLOBAL",
"base_type": "TEL",
"tp": 0,
"fp": 77,
"fn": 0,
"precision": 0.0,
"base_tp": 193,
"base_fp": 8,
"base_precision": 0.9602
},
{
"type": "ADRESSE_GLOBAL",
"base_type": "ADRESSE",
"tp": 0,
"fp": 55,
"fn": 0,
"precision": 0.0,
"base_tp": 72,
"base_fp": 10,
"base_precision": 0.878
},
{
"type": "CODE_POSTAL_GLOBAL",
"base_type": "CODE_POSTAL",
"tp": 0,
"fp": 39,
"fn": 0,
"precision": 0.0,
"base_tp": 50,
"base_fp": 10,
"base_precision": 0.8333
},
{
"type": "ETAB_GLOBAL",
"base_type": "ETAB",
"tp": 0,
"fp": 36,
"fn": 0,
"precision": 0.0,
"base_tp": 0,
"base_fp": 0,
"base_precision": 0.0
},
{
"type": "EMAIL_GLOBAL",
"base_type": "EMAIL",
"tp": 0,
"fp": 28,
"fn": 0,
"precision": 0.0,
"base_tp": 62,
"base_fp": 0,
"base_precision": 1.0
},
{
"type": "DATE_NAISSANCE_GLOBAL",
"base_type": "DATE_NAISSANCE",
"tp": 0,
"fp": 20,
"fn": 0,
"precision": 0.0,
"base_tp": 114,
"base_fp": 0,
"base_precision": 1.0
},
{
"type": "VILLE_GLOBAL",
"base_type": "VILLE",
"tp": 0,
"fp": 10,
"fn": 0,
"precision": 0.0,
"base_tp": 5,
"base_fp": 20,
"base_precision": 0.2
},
{
"type": "EPISODE_GLOBAL",
"base_type": "EPISODE",
"tp": 0,
"fp": 9,
"fn": 0,
"precision": 0.0,
"base_tp": 18,
"base_fp": 106,
"base_precision": 0.1452
},
{
"type": "RPPS_GLOBAL",
"base_type": "RPPS",
"tp": 0,
"fp": 7,
"fn": 0,
"precision": 0.0,
"base_tp": 21,
"base_fp": 0,
"base_precision": 1.0
}
],
"nom_extracted_stats": {
"precision": 0.0,
"recall": 0.0,
"f1_score": 0.0,
"true_positives": 0,
"false_positives": 3846,
"false_negatives": 0
},
"recommendations": [
{
"priority": 1,
"title": "Désactiver les types *_GLOBAL inutiles",
"types": [
"NOM_GLOBAL",
"TEL_GLOBAL",
"ADRESSE_GLOBAL",
"CODE_POSTAL_GLOBAL",
"ETAB_GLOBAL",
"EMAIL_GLOBAL",
"DATE_NAISSANCE_GLOBAL",
"VILLE_GLOBAL",
"EPISODE_GLOBAL",
"RPPS_GLOBAL"
],
"impact": "Réduction de 951 FP",
"gain_precision": "+15.6 points",
"effort": "Faible",
"implementation": "Modifier la fonction de propagation globale pour exclure ces types"
},
{
"priority": 2,
"title": "Désactiver NOM_EXTRACTED",
"types": [
"NOM_EXTRACTED"
],
"impact": "Réduction de 3846 FP",
"gain_precision": "+62.9 points",
"effort": "Faible",
"implementation": "Commenter ou supprimer la logique d'extraction de noms"
}
],
"estimated_gain": {
"current_fp": 4951,
"estimated_fp": 154,
"fp_reduction": 4797,
"current_precision": 0.1897,
"estimated_precision": 0.8827113480578828,
"precision_gain": 0.6930113480578828
}
}

View File

@@ -0,0 +1,283 @@
{
"extraction_date": "2026-03-02",
"total_detections": 3846,
"unique_terms": 316,
"frequent_terms_count": 196,
"medical_terms_count": 7,
"top_50_frequent": [
{
"term": "lucie",
"count": 188,
"is_medical": false
},
{
"term": "schmitt",
"count": 185,
"is_medical": false
},
{
"term": "masse",
"count": 170,
"is_medical": false
},
{
"term": "laurence",
"count": 138,
"is_medical": false
},
{
"term": "belleau",
"count": 135,
"is_medical": false
},
{
"term": "céline",
"count": 124,
"is_medical": false
},
{
"term": "justine",
"count": 96,
"is_medical": false
},
{
"term": "burg",
"count": 96,
"is_medical": false
},
{
"term": "schneider",
"count": 90,
"is_medical": false
},
{
"term": "sophie",
"count": 74,
"is_medical": false
},
{
"term": "aguer",
"count": 74,
"is_medical": false
},
{
"term": "kasparian",
"count": 68,
"is_medical": false
},
{
"term": "alexandra",
"count": 64,
"is_medical": false
},
{
"term": "valette",
"count": 63,
"is_medical": false
},
{
"term": "buccale",
"count": 61,
"is_medical": false
},
{
"term": "samuel",
"count": 61,
"is_medical": false
},
{
"term": "bannier",
"count": 60,
"is_medical": false
},
{
"term": "grihault",
"count": 60,
"is_medical": false
},
{
"term": "pedia",
"count": 59,
"is_medical": false
},
{
"term": "françois",
"count": 57,
"is_medical": false
},
{
"term": "quentin",
"count": 57,
"is_medical": false
},
{
"term": "cazenave",
"count": 55,
"is_medical": false
},
{
"term": "bedouet",
"count": 46,
"is_medical": false
},
{
"term": "jean",
"count": 44,
"is_medical": false
},
{
"term": "hurtado",
"count": 44,
"is_medical": false
},
{
"term": "droit",
"count": 43,
"is_medical": true
},
{
"term": "jean-pierre",
"count": 39,
"is_medical": false
},
{
"term": "echelle",
"count": 37,
"is_medical": false
},
{
"term": "glasgow",
"count": 37,
"is_medical": false
},
{
"term": "carriere",
"count": 35,
"is_medical": false
},
{
"term": "juliette",
"count": 35,
"is_medical": false
},
{
"term": "txomin",
"count": 33,
"is_medical": false
},
{
"term": "maternowski",
"count": 31,
"is_medical": false
},
{
"term": "cuillere",
"count": 29,
"is_medical": false
},
{
"term": "cafe",
"count": 29,
"is_medical": false
},
{
"term": "vomissements",
"count": 26,
"is_medical": false
},
{
"term": "gournay",
"count": 26,
"is_medical": false
},
{
"term": "eva",
"count": 25,
"is_medical": false
},
{
"term": "enf",
"count": 24,
"is_medical": false
},
{
"term": "marie-line",
"count": 24,
"is_medical": false
},
{
"term": "picamilh",
"count": 23,
"is_medical": false
},
{
"term": "eneko",
"count": 23,
"is_medical": false
},
{
"term": "bronswick",
"count": 22,
"is_medical": false
},
{
"term": "larrouy",
"count": 20,
"is_medical": false
},
{
"term": "elodie",
"count": 20,
"is_medical": false
},
{
"term": "preremplie",
"count": 18,
"is_medical": false
},
{
"term": "infectieuses",
"count": 16,
"is_medical": false
},
{
"term": "petriat",
"count": 16,
"is_medical": false
},
{
"term": "cotyle",
"count": 16,
"is_medical": false
},
{
"term": "sylvie",
"count": 15,
"is_medical": false
}
],
"medical_terms_by_category": {
"Titres/Fonctions": [
"droit",
"droite"
],
"Pathologies": [
"anastomose"
],
"Termes généraux": [
"colique",
"hilaire",
"urologique",
"vasculaire"
]
},
"all_medical_terms": [
"anastomose",
"colique",
"droit",
"droite",
"hilaire",
"urologique",
"vasculaire"
]
}

View File

@@ -0,0 +1,12 @@
# New medical stopwords, extracted automatically.
# To be added to _MEDICAL_STOP_WORDS_SET in anonymizer_core_refactored_onnx.py
# NOTE(review): generated artifact — regenerate via the extraction script
# rather than editing this set by hand.
NEW_MEDICAL_STOPWORDS = {
"anastomose",
"colique",
"droit",
"droite",
"hilaire",
"urologique",
"vasculaire",
}

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
Analyse la propagation globale et propose des optimisations.
La propagation globale crée 4,797 FP (96.9% du total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)
Ce script analyse quels types bénéficient vraiment de la propagation.
"""
import sys
import json
from pathlib import Path
from collections import defaultdict, Counter
def analyze_global_propagation():
    """Analyze globally-propagated PII types and print optimization recommendations.

    Loads the baseline quality-evaluation JSON, compares each ``*_GLOBAL`` type
    against its non-global base type, evaluates ``NOM_EXTRACTED``, builds a
    prioritized recommendation list, estimates the total precision gain, and
    saves the analysis to
    ``tests/ground_truth/analysis/global_propagation_analysis.json``.

    Returns:
        int: 0 on success (used as the process exit code).
    """
    print("="*80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("="*80)

    # Load the evaluation results produced by the baseline quality run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)
    by_type = eval_data['by_type']

    # Partition detection types: globally-propagated vs. plain base types
    # (NOM_EXTRACTED is analyzed separately below).
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {k: v for k, v in by_type.items() if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'}
    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type statistics for each *_GLOBAL type, paired with its base type.
    print(f"\n" + "="*80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("="*80)
    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})
        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0)
        })

    # Worst offenders first (by false-positive count).
    global_stats.sort(key=lambda x: x['fp'], reverse=True)
    print(f"\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")
        # Usefulness verdict for this global type.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(f" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()

    # NOM_EXTRACTED analysis.
    print("="*80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("="*80)
    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print(f"\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / eval_data['global_metrics']['false_positives'] * 100:.1f}% des FP totaux")
        print(f"\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Recommendations. For each one we also record its numeric FP reduction
    # directly, instead of re-parsing the human-readable "impact" string with
    # a regex later (the previous approach was fragile and re-imported `re`
    # inside the loop).
    print(f"\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)
    recommendations = []
    fp_reductions = []

    # Recommendation 1: disable *_GLOBAL types that produce only FP.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        fp_reductions.append(total_fp_saved)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "gain_precision": f"+{total_fp_saved / (eval_data['global_metrics']['true_positives'] + eval_data['global_metrics']['false_positives']) * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED.
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        fp_reductions.append(nom_extracted_stats['false_positives'])
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_extracted_stats['false_positives']} FP",
            "gain_precision": f"+{nom_extracted_stats['false_positives'] / (eval_data['global_metrics']['true_positives'] + eval_data['global_metrics']['false_positives']) * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: improve filtering for low-precision *_GLOBAL types
    # (estimate: 70% of their FP can be removed).
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        fp_reductions.append(int(total_fp * 0.7))
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {int(total_fp * 0.7)} FP (70% des {total_fp})",
            "gain_precision": f"+{int(total_fp * 0.7) / (eval_data['global_metrics']['true_positives'] + eval_data['global_metrics']['false_positives']) * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for i, rec in enumerate(recommendations, 1):
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()

    # Total estimated gain, from the per-recommendation FP reductions.
    total_fp_reduction = sum(fp_reductions)
    current_fp = eval_data['global_metrics']['false_positives']
    current_tp = eval_data['global_metrics']['true_positives']
    current_precision = eval_data['global_metrics']['precision']
    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0
    print("="*80)
    print("ESTIMATION DU GAIN TOTAL")
    print("="*80)
    print(f"\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")
    if new_precision >= 0.97:
        print(f"\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Save the analysis. Create the output directory first — the companion
    # extraction script does this, but this script must not depend on it
    # having run.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"
    output_data = {
        "analysis_date": "2026-03-02",
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "="*80)
    return 0


if __name__ == "__main__":
    sys.exit(analyze_global_propagation())

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
Extrait les termes médicaux des documents pour enrichir les stopwords.
Analyse les détections NOM_EXTRACTED pour identifier les faux positifs
récurrents (termes médicaux, anatomiques, etc.) à ajouter aux stopwords.
"""
import sys
import json
from pathlib import Path
from collections import Counter
import re
def extract_medical_terms():
    """Extract recurring medical terms from NOM_EXTRACTED detections.

    Scans the baseline-anonymized audit files, counts NOM_EXTRACTED
    detections, flags likely medical terms via regex patterns, groups them
    by category, and writes both a JSON report and a ready-to-paste Python
    stopwords file under ``tests/ground_truth/analysis``.

    Returns:
        int: 0 on success, 1 when no audit files are found (process exit code).
    """
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    audit_files = sorted(baseline_dir.glob("*.audit.jsonl"))
    if not audit_files:
        print(f"✗ Aucun fichier audit trouvé dans {baseline_dir}")
        return 1

    print("="*80)
    print("EXTRACTION DES TERMES MÉDICAUX")
    print("="*80)
    print(f"\n📁 Analyse de {len(audit_files)} fichiers audit...")

    # Collect every NOM_EXTRACTED detection (lower-cased original text).
    nom_extracted = []
    for audit_file in audit_files:
        with open(audit_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    det = json.loads(line)
                    if det.get('kind') == 'NOM_EXTRACTED':
                        text = det.get('original', '').strip()
                        if text:
                            nom_extracted.append(text.lower())
    print(f"\n📊 {len(nom_extracted)} détections NOM_EXTRACTED trouvées")

    counter = Counter(nom_extracted)
    print(f"📊 {len(counter)} termes uniques")

    # Candidate medical terms: whole-word patterns plus medical suffixes.
    medical_patterns = [
        r'^(dr|docteur|professeur|prof)$',
        r'^(service|unite|unité|departement|département)$',
        r'^(hopital|hôpital|clinique|centre|chu|ch)$',
        r'^(medecin|médecin|infirmier|infirmière|aide.soignant)$',
        r'^(consultation|hospitalisation|urgence|urgences)$',
        r'^(chirurgie|cardiologie|pneumologie|neurologie|oncologie)$',
        r'^(radiologie|imagerie|scanner|irm|echographie|échographie)$',
        r'^(biologie|laboratoire|analyse|prelevement|prélèvement)$',
        r'^(traitement|medicament|médicament|ordonnance|prescription)$',
        r'^(diagnostic|pathologie|maladie|syndrome|infection)$',
        r'^(patient|malade|sujet|cas)$',
        r'^(examen|bilan|controle|contrôle|suivi)$',
        r'^(resultat|résultat|valeur|taux|dosage)$',
        r'^(antecedent|antécédent|allergie|risque|facteur)$',
        r'^(gauche|droit|droite|superieur|supérieur|inferieur|inférieur)$',
        r'^(anterieur|antérieur|posterieur|postérieur|lateral|latéral)$',
        r'^(proximal|distal|medial|médial)$',
        r'(ologie|ique|aire|ose|ite|ome)$',  # medical suffixes
    ]
    medical_terms = set()
    frequent_terms = []
    for term, count in counter.most_common():
        # Frequent terms (>= 3 occurrences) are reported separately.
        if count >= 3:
            frequent_terms.append((term, count))
        is_medical = any(re.search(pattern, term, re.IGNORECASE) for pattern in medical_patterns)
        if is_medical:
            medical_terms.add(term)
    print(f"\n📊 {len(frequent_terms)} termes fréquents (≥3 occurrences)")
    print(f"📊 {len(medical_terms)} termes médicaux identifiés automatiquement")

    # Top 50 most frequent terms, with a marker on medical ones.
    print(f"\n🔝 Top 50 termes les plus fréquents:")
    for i, (term, count) in enumerate(frequent_terms[:50], 1):
        is_medical = term in medical_terms
        marker = "🏥" if is_medical else " "
        print(f" {i:2d}. {marker} {term:30s} ({count:3d} occurrences)")

    # Categorize the medical terms. First matching category wins.
    categories = {
        "Titres/Fonctions": [],
        "Services/Départements": [],
        "Établissements": [],
        "Examens/Procédures": [],
        "Anatomie": [],
        "Pathologies": [],
        "Médicaments/Traitements": [],
        "Termes généraux": []
    }
    # Short alternatives are word-bounded: bare substrings like 'dr', 'ch'
    # and 'irm' previously matched inside unrelated words ('droit' was filed
    # under Titres/Fonctions, 'chirurgie' under Établissements). Suffix-like
    # alternatives ('ologie', 'ite|ose|ome') stay as substring/suffix matches.
    category_patterns = {
        "Titres/Fonctions": [r'\b(dr|prof)\b|docteur|professeur|medecin|médecin|infirmi'],
        "Services/Départements": [r'\b(service|unite|unité|departement|département)\b|ologie'],
        "Établissements": [r'hopital|hôpital|clinique|centre|\b(chu|ch)\b'],
        "Examens/Procédures": [r'examen|bilan|scanner|\birm\b|echo|radio|analyse'],
        "Anatomie": [r'gauche|droit|superieur|inferieur|anterieur|posterieur|lateral|proximal|distal'],
        "Pathologies": [r'(ite|ose|ome)$|pathologie|maladie|syndrome|infection'],
        "Médicaments/Traitements": [r'traitement|medicament|médicament|ordonnance|prescription'],
    }
    for term in medical_terms:
        categorized = False
        for category, patterns in category_patterns.items():
            if any(re.search(p, term, re.IGNORECASE) for p in patterns):
                categories[category].append(term)
                categorized = True
                break
        if not categorized:
            categories["Termes généraux"].append(term)

    # Report per category (capped at 20 terms each).
    print(f"\n" + "="*80)
    print("TERMES MÉDICAUX PAR CATÉGORIE")
    print("="*80)
    for category, terms in categories.items():
        if terms:
            print(f"\n{category} ({len(terms)} termes):")
            for term in sorted(terms)[:20]:
                count = counter[term]
                print(f" - {term} ({count} occ.)")
            if len(terms) > 20:
                print(f" ... et {len(terms) - 20} autres")

    # Save the JSON report. parents=True so a fresh checkout (where
    # tests/ground_truth/analysis does not exist yet) still works.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_data = {
        "extraction_date": "2026-03-02",
        "total_detections": len(nom_extracted),
        "unique_terms": len(counter),
        "frequent_terms_count": len(frequent_terms),
        "medical_terms_count": len(medical_terms),
        "top_50_frequent": [
            {"term": term, "count": count, "is_medical": term in medical_terms}
            for term, count in frequent_terms[:50]
        ],
        "medical_terms_by_category": {
            category: sorted(terms)
            for category, terms in categories.items()
            if terms
        },
        "all_medical_terms": sorted(medical_terms)
    }
    output_file = output_dir / "medical_stopwords_candidates.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Résultats sauvegardés: {output_file}")

    # Generate a ready-to-paste Python stopwords file.
    stopwords_file = output_dir / "new_medical_stopwords.py"
    with open(stopwords_file, 'w', encoding='utf-8') as f:
        f.write("# Nouveaux stopwords médicaux extraits automatiquement\n")
        f.write("# À ajouter à _MEDICAL_STOP_WORDS_SET dans anonymizer_core_refactored_onnx.py\n\n")
        f.write("NEW_MEDICAL_STOPWORDS = {\n")
        for term in sorted(medical_terms):
            f.write(f' "{term}",\n')
        f.write("}\n")
    print(f"📊 Stopwords Python générés: {stopwords_file}")

    print(f"\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)
    print(f"""
1. Réviser manuellement les termes dans: {output_file}
2. Ajouter les termes validés à _MEDICAL_STOP_WORDS_SET
3. Re-exécuter l'anonymisation et l'évaluation
4. Vérifier la réduction des FP NOM_EXTRACTED
Gain estimé: Réduction de ~{len(medical_terms)} termes récurrents
Impact: Amélioration de la précision de plusieurs points
""")
    return 0


if __name__ == "__main__":
    sys.exit(extract_medical_terms())