feat: Analyse propagation globale - 100% des *_GLOBAL et NOM_EXTRACTED sont des FP
This commit is contained in:
@@ -26,9 +26,9 @@
|
||||
- [x] 1.1.3.4 Valider les annotations (double vérification)
|
||||
- [x] 1.1.3.5 Calculer les statistiques du dataset (PII par type, difficulté)
|
||||
|
||||
- [ ] 1.1.4 Enrichir la liste des stopwords médicaux
|
||||
- [ ] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés
|
||||
- [ ] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort)
|
||||
- [-] 1.1.4 Enrichir la liste des stopwords médicaux
|
||||
- [x] 1.1.4.1 Extraire les termes médicaux des 30 documents annotés
|
||||
- [x] 1.1.4.2 Identifier les faux positifs actuels (termes masqués à tort)
|
||||
- [ ] 1.1.4.3 Ajouter les nouveaux termes à `_MEDICAL_STOP_WORDS_SET`
|
||||
- [ ] 1.1.4.4 Documenter les sources des stopwords
|
||||
|
||||
|
||||
164
tests/ground_truth/analysis/global_propagation_analysis.json
Normal file
164
tests/ground_truth/analysis/global_propagation_analysis.json
Normal file
@@ -0,0 +1,164 @@
|
||||
{
|
||||
"analysis_date": "2026-03-02",
|
||||
"global_types_stats": [
|
||||
{
|
||||
"type": "NOM_GLOBAL",
|
||||
"base_type": "NOM",
|
||||
"tp": 0,
|
||||
"fp": 670,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 506,
|
||||
"base_fp": 0,
|
||||
"base_precision": 1.0
|
||||
},
|
||||
{
|
||||
"type": "TEL_GLOBAL",
|
||||
"base_type": "TEL",
|
||||
"tp": 0,
|
||||
"fp": 77,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 193,
|
||||
"base_fp": 8,
|
||||
"base_precision": 0.9602
|
||||
},
|
||||
{
|
||||
"type": "ADRESSE_GLOBAL",
|
||||
"base_type": "ADRESSE",
|
||||
"tp": 0,
|
||||
"fp": 55,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 72,
|
||||
"base_fp": 10,
|
||||
"base_precision": 0.878
|
||||
},
|
||||
{
|
||||
"type": "CODE_POSTAL_GLOBAL",
|
||||
"base_type": "CODE_POSTAL",
|
||||
"tp": 0,
|
||||
"fp": 39,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 50,
|
||||
"base_fp": 10,
|
||||
"base_precision": 0.8333
|
||||
},
|
||||
{
|
||||
"type": "ETAB_GLOBAL",
|
||||
"base_type": "ETAB",
|
||||
"tp": 0,
|
||||
"fp": 36,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 0,
|
||||
"base_fp": 0,
|
||||
"base_precision": 0.0
|
||||
},
|
||||
{
|
||||
"type": "EMAIL_GLOBAL",
|
||||
"base_type": "EMAIL",
|
||||
"tp": 0,
|
||||
"fp": 28,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 62,
|
||||
"base_fp": 0,
|
||||
"base_precision": 1.0
|
||||
},
|
||||
{
|
||||
"type": "DATE_NAISSANCE_GLOBAL",
|
||||
"base_type": "DATE_NAISSANCE",
|
||||
"tp": 0,
|
||||
"fp": 20,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 114,
|
||||
"base_fp": 0,
|
||||
"base_precision": 1.0
|
||||
},
|
||||
{
|
||||
"type": "VILLE_GLOBAL",
|
||||
"base_type": "VILLE",
|
||||
"tp": 0,
|
||||
"fp": 10,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 5,
|
||||
"base_fp": 20,
|
||||
"base_precision": 0.2
|
||||
},
|
||||
{
|
||||
"type": "EPISODE_GLOBAL",
|
||||
"base_type": "EPISODE",
|
||||
"tp": 0,
|
||||
"fp": 9,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 18,
|
||||
"base_fp": 106,
|
||||
"base_precision": 0.1452
|
||||
},
|
||||
{
|
||||
"type": "RPPS_GLOBAL",
|
||||
"base_type": "RPPS",
|
||||
"tp": 0,
|
||||
"fp": 7,
|
||||
"fn": 0,
|
||||
"precision": 0.0,
|
||||
"base_tp": 21,
|
||||
"base_fp": 0,
|
||||
"base_precision": 1.0
|
||||
}
|
||||
],
|
||||
"nom_extracted_stats": {
|
||||
"precision": 0.0,
|
||||
"recall": 0.0,
|
||||
"f1_score": 0.0,
|
||||
"true_positives": 0,
|
||||
"false_positives": 3846,
|
||||
"false_negatives": 0
|
||||
},
|
||||
"recommendations": [
|
||||
{
|
||||
"priority": 1,
|
||||
"title": "Désactiver les types *_GLOBAL inutiles",
|
||||
"types": [
|
||||
"NOM_GLOBAL",
|
||||
"TEL_GLOBAL",
|
||||
"ADRESSE_GLOBAL",
|
||||
"CODE_POSTAL_GLOBAL",
|
||||
"ETAB_GLOBAL",
|
||||
"EMAIL_GLOBAL",
|
||||
"DATE_NAISSANCE_GLOBAL",
|
||||
"VILLE_GLOBAL",
|
||||
"EPISODE_GLOBAL",
|
||||
"RPPS_GLOBAL"
|
||||
],
|
||||
"impact": "Réduction de 951 FP",
|
||||
"gain_precision": "+15.6 points",
|
||||
"effort": "Faible",
|
||||
"implementation": "Modifier la fonction de propagation globale pour exclure ces types"
|
||||
},
|
||||
{
|
||||
"priority": 2,
|
||||
"title": "Désactiver NOM_EXTRACTED",
|
||||
"types": [
|
||||
"NOM_EXTRACTED"
|
||||
],
|
||||
"impact": "Réduction de 3846 FP",
|
||||
"gain_precision": "+62.9 points",
|
||||
"effort": "Faible",
|
||||
"implementation": "Commenter ou supprimer la logique d'extraction de noms"
|
||||
}
|
||||
],
|
||||
"estimated_gain": {
|
||||
"current_fp": 4951,
|
||||
"estimated_fp": 154,
|
||||
"fp_reduction": 4797,
|
||||
"current_precision": 0.1897,
|
||||
"estimated_precision": 0.8827113480578828,
|
||||
"precision_gain": 0.6930113480578828
|
||||
}
|
||||
}
|
||||
283
tests/ground_truth/analysis/medical_stopwords_candidates.json
Normal file
283
tests/ground_truth/analysis/medical_stopwords_candidates.json
Normal file
@@ -0,0 +1,283 @@
|
||||
{
|
||||
"extraction_date": "2026-03-02",
|
||||
"total_detections": 3846,
|
||||
"unique_terms": 316,
|
||||
"frequent_terms_count": 196,
|
||||
"medical_terms_count": 7,
|
||||
"top_50_frequent": [
|
||||
{
|
||||
"term": "lucie",
|
||||
"count": 188,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "schmitt",
|
||||
"count": 185,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "masse",
|
||||
"count": 170,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "laurence",
|
||||
"count": 138,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "belleau",
|
||||
"count": 135,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "céline",
|
||||
"count": 124,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "justine",
|
||||
"count": 96,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "burg",
|
||||
"count": 96,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "schneider",
|
||||
"count": 90,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "sophie",
|
||||
"count": 74,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "aguer",
|
||||
"count": 74,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "kasparian",
|
||||
"count": 68,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "alexandra",
|
||||
"count": 64,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "valette",
|
||||
"count": 63,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "buccale",
|
||||
"count": 61,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "samuel",
|
||||
"count": 61,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "bannier",
|
||||
"count": 60,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "grihault",
|
||||
"count": 60,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "pedia",
|
||||
"count": 59,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "françois",
|
||||
"count": 57,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "quentin",
|
||||
"count": 57,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "cazenave",
|
||||
"count": 55,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "bedouet",
|
||||
"count": 46,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "jean",
|
||||
"count": 44,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "hurtado",
|
||||
"count": 44,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "droit",
|
||||
"count": 43,
|
||||
"is_medical": true
|
||||
},
|
||||
{
|
||||
"term": "jean-pierre",
|
||||
"count": 39,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "echelle",
|
||||
"count": 37,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "glasgow",
|
||||
"count": 37,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "carriere",
|
||||
"count": 35,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "juliette",
|
||||
"count": 35,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "txomin",
|
||||
"count": 33,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "maternowski",
|
||||
"count": 31,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "cuillere",
|
||||
"count": 29,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "cafe",
|
||||
"count": 29,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "vomissements",
|
||||
"count": 26,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "gournay",
|
||||
"count": 26,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "eva",
|
||||
"count": 25,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "enf",
|
||||
"count": 24,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "marie-line",
|
||||
"count": 24,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "picamilh",
|
||||
"count": 23,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "eneko",
|
||||
"count": 23,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "bronswick",
|
||||
"count": 22,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "larrouy",
|
||||
"count": 20,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "elodie",
|
||||
"count": 20,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "preremplie",
|
||||
"count": 18,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "infectieuses",
|
||||
"count": 16,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "petriat",
|
||||
"count": 16,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "cotyle",
|
||||
"count": 16,
|
||||
"is_medical": false
|
||||
},
|
||||
{
|
||||
"term": "sylvie",
|
||||
"count": 15,
|
||||
"is_medical": false
|
||||
}
|
||||
],
|
||||
"medical_terms_by_category": {
|
||||
"Titres/Fonctions": [
|
||||
"droit",
|
||||
"droite"
|
||||
],
|
||||
"Pathologies": [
|
||||
"anastomose"
|
||||
],
|
||||
"Termes généraux": [
|
||||
"colique",
|
||||
"hilaire",
|
||||
"urologique",
|
||||
"vasculaire"
|
||||
]
|
||||
},
|
||||
"all_medical_terms": [
|
||||
"anastomose",
|
||||
"colique",
|
||||
"droit",
|
||||
"droite",
|
||||
"hilaire",
|
||||
"urologique",
|
||||
"vasculaire"
|
||||
]
|
||||
}
|
||||
12
tests/ground_truth/analysis/new_medical_stopwords.py
Normal file
12
tests/ground_truth/analysis/new_medical_stopwords.py
Normal file
@@ -0,0 +1,12 @@
|
||||
# Medical stopwords extracted automatically by tools/extract_medical_stopwords.py.
# Merge these into _MEDICAL_STOP_WORDS_SET in anonymizer_core_refactored_onnx.py
# after manual review.

NEW_MEDICAL_STOPWORDS = {
    # General / anatomical vocabulary flagged as NOM_EXTRACTED false positives.
    "anastomose",
    "colique",
    "droit",
    "droite",
    "hilaire",
    "urologique",
    "vasculaire",
}
|
||||
214
tools/analyze_global_propagation.py
Executable file
214
tools/analyze_global_propagation.py
Executable file
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python3
"""
Analyze global propagation and propose optimizations.

Global propagation creates 4,797 FP (96.9% of the total):
- NOM_EXTRACTED: 3,846 FP (77.7%)
- *_GLOBAL: 951 FP (19.2%)

This script analyzes which types actually benefit from propagation.
"""
import sys
import json
import re  # moved to top level: was imported inside the estimation loop
from pathlib import Path
from collections import defaultdict, Counter


def analyze_global_propagation():
    """Analyze global propagation and print/save recommendations.

    Reads the baseline quality-evaluation JSON, measures the cost/benefit of
    every ``*_GLOBAL`` type and of ``NOM_EXTRACTED``, prints a report, and
    saves the analysis to
    ``tests/ground_truth/analysis/global_propagation_analysis.json``.

    Returns:
        int: 0 on success (used as the process exit code).
    """

    print("="*80)
    print("ANALYSE DE LA PROPAGATION GLOBALE")
    print("="*80)

    # Load the evaluation results produced by the baseline run.
    eval_file = Path("tests/ground_truth/quality_evaluation/baseline_quality_evaluation.json")

    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_data = json.load(f)

    by_type = eval_data['by_type']

    # Hoisted: the TP+FP denominator is reused by all precision-gain estimates.
    global_metrics = eval_data['global_metrics']
    total_detections = global_metrics['true_positives'] + global_metrics['false_positives']

    # Split *_GLOBAL types from their base (non-propagated) counterparts.
    global_types = {k: v for k, v in by_type.items() if k.endswith('_GLOBAL')}
    non_global_types = {k: v for k, v in by_type.items() if not k.endswith('_GLOBAL') and k != 'NOM_EXTRACTED'}

    print(f"\n📊 Types avec propagation globale: {len(global_types)}")
    print(f"📊 Types sans propagation: {len(non_global_types)}")

    # Per-type statistics for every *_GLOBAL type.
    print("\n" + "="*80)
    print("ANALYSE DES TYPES *_GLOBAL")
    print("="*80)

    global_stats = []
    for pii_type, stats in global_types.items():
        base_type = pii_type.replace('_GLOBAL', '')
        base_stats = non_global_types.get(base_type, {})

        global_stats.append({
            "type": pii_type,
            "base_type": base_type,
            "tp": stats['true_positives'],
            "fp": stats['false_positives'],
            "fn": stats['false_negatives'],
            "precision": stats['precision'],
            "base_tp": base_stats.get('true_positives', 0),
            "base_fp": base_stats.get('false_positives', 0),
            "base_precision": base_stats.get('precision', 0.0)
        })

    # Sort worst offenders (most false positives) first.
    global_stats.sort(key=lambda x: x['fp'], reverse=True)

    print("\n🔴 Types *_GLOBAL par nombre de faux positifs:\n")
    for i, stat in enumerate(global_stats, 1):
        print(f"{i}. {stat['type']}")
        print(f" TP: {stat['tp']}, FP: {stat['fp']}, Précision: {stat['precision']:.2%}")
        print(f" Type de base ({stat['base_type']}): TP: {stat['base_tp']}, FP: {stat['base_fp']}, Précision: {stat['base_precision']:.2%}")

        # Assess the usefulness of the propagated type versus its FP cost.
        if stat['tp'] == 0 and stat['fp'] > 0:
            print(f" ⚠️ INUTILE: Aucun TP, {stat['fp']} FP → À DÉSACTIVER")
        elif stat['precision'] < 0.5:
            print(f" ⚠️ PROBLÉMATIQUE: Précision {stat['precision']:.2%} → À DÉSACTIVER")
        elif stat['tp'] > 0 and stat['precision'] >= 0.8:
            print(f" ✅ UTILE: {stat['tp']} TP avec bonne précision → À CONSERVER")
        else:
            print(" ⚙️ À ÉVALUER: Bénéfice/coût à analyser")
        print()

    # NOM_EXTRACTED analysis.
    print("="*80)
    print("ANALYSE DE NOM_EXTRACTED")
    print("="*80)

    nom_extracted_stats = by_type.get('NOM_EXTRACTED', {})
    print("\n📊 NOM_EXTRACTED:")
    print(f" TP: {nom_extracted_stats.get('true_positives', 0)}")
    print(f" FP: {nom_extracted_stats.get('false_positives', 0)}")
    print(f" Précision: {nom_extracted_stats.get('precision', 0):.2%}")

    if nom_extracted_stats.get('false_positives', 0) > 1000:
        print(f"\n ⚠️ PROBLÈME MAJEUR: {nom_extracted_stats['false_positives']} FP")
        print(f" Impact: {nom_extracted_stats['false_positives'] / global_metrics['false_positives'] * 100:.1f}% des FP totaux")
        print("\n 💡 SOLUTION: Désactiver NOM_EXTRACTED ou améliorer drastiquement le filtrage")

    # Recommendations.
    print("\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)

    recommendations = []

    # Recommendation 1: disable *_GLOBAL types that only produce false positives.
    useless_global = [s for s in global_stats if s['tp'] == 0 and s['fp'] > 0]
    if useless_global:
        total_fp_saved = sum(s['fp'] for s in useless_global)
        recommendations.append({
            "priority": 1,
            "title": "Désactiver les types *_GLOBAL inutiles",
            "types": [s['type'] for s in useless_global],
            "impact": f"Réduction de {total_fp_saved} FP",
            "gain_precision": f"+{total_fp_saved / total_detections * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Modifier la fonction de propagation globale pour exclure ces types"
        })

    # Recommendation 2: disable NOM_EXTRACTED when its FP volume is massive.
    if nom_extracted_stats.get('false_positives', 0) > 1000:
        recommendations.append({
            "priority": 2,
            "title": "Désactiver NOM_EXTRACTED",
            "types": ['NOM_EXTRACTED'],
            "impact": f"Réduction de {nom_extracted_stats['false_positives']} FP",
            "gain_precision": f"+{nom_extracted_stats['false_positives'] / total_detections * 100:.1f} points",
            "effort": "Faible",
            "implementation": "Commenter ou supprimer la logique d'extraction de noms"
        })

    # Recommendation 3: improve filtering for low-precision *_GLOBAL types.
    problematic_global = [s for s in global_stats if s['tp'] > 0 and s['precision'] < 0.5]
    if problematic_global:
        total_fp = sum(s['fp'] for s in problematic_global)
        recommendations.append({
            "priority": 3,
            "title": "Améliorer le filtrage des types *_GLOBAL problématiques",
            "types": [s['type'] for s in problematic_global],
            "impact": f"Réduction estimée de {int(total_fp * 0.7)} FP (70% des {total_fp})",
            "gain_precision": f"+{int(total_fp * 0.7) / total_detections * 100:.1f} points",
            "effort": "Moyen",
            "implementation": "Ajouter validation croisée ou seuil de confiance avant propagation"
        })

    print(f"\n🎯 {len(recommendations)} recommandations:\n")
    for rec in recommendations:
        print(f"Priorité {rec['priority']}: {rec['title']}")
        print(f" Types: {', '.join(rec['types'][:5])}")
        if len(rec['types']) > 5:
            print(f" ... et {len(rec['types']) - 5} autres")
        print(f" Impact: {rec['impact']}")
        print(f" Gain précision: {rec['gain_precision']}")
        print(f" Effort: {rec['effort']}")
        print(f" Implémentation: {rec['implementation']}")
        print()

    # Total gain estimate: the FP count is the first integer in each "impact"
    # string (e.g. "Réduction de 951 FP").
    total_fp_reduction = 0
    for rec in recommendations:
        match = re.search(r'(\d+)', rec['impact'])
        if match:
            total_fp_reduction += int(match.group(1))

    current_fp = global_metrics['false_positives']
    current_tp = global_metrics['true_positives']
    current_precision = global_metrics['precision']

    new_fp = current_fp - total_fp_reduction
    new_precision = current_tp / (current_tp + new_fp) if (current_tp + new_fp) > 0 else 0

    print("="*80)
    print("ESTIMATION DU GAIN TOTAL")
    print("="*80)
    print("\n🎯 Avec toutes les recommandations:")
    print(f" - FP actuels: {current_fp}")
    print(f" - FP estimés: {new_fp} (-{total_fp_reduction})")
    print(f" - Précision actuelle: {current_precision:.2%}")
    print(f" - Précision estimée: {new_precision:.2%} (+{(new_precision - current_precision)*100:.1f} points)")

    if new_precision >= 0.97:
        print("\n ✅ Objectif de précision (≥97%) ATTEIGNABLE!")
    else:
        print(f"\n ⚠️ Objectif de précision (≥97%) nécessite {(0.97 - new_precision)*100:.1f} points supplémentaires")

    # Save the analysis. Bug fix: create the output directory first — the
    # original assumed tests/ground_truth/analysis already existed (the
    # sibling extract_medical_stopwords.py script does mkdir, this one didn't).
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "global_propagation_analysis.json"

    output_data = {
        "analysis_date": "2026-03-02",
        "global_types_stats": global_stats,
        "nom_extracted_stats": nom_extracted_stats,
        "recommendations": recommendations,
        "estimated_gain": {
            "current_fp": current_fp,
            "estimated_fp": new_fp,
            "fp_reduction": total_fp_reduction,
            "current_precision": current_precision,
            "estimated_precision": new_precision,
            "precision_gain": new_precision - current_precision
        }
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Analyse sauvegardée: {output_file}")
    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(analyze_global_propagation())
|
||||
200
tools/extract_medical_stopwords.py
Executable file
200
tools/extract_medical_stopwords.py
Executable file
@@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
"""
Extract medical terms from documents to enrich the stopword list.

Analyzes NOM_EXTRACTED detections to identify recurring false positives
(medical terms, anatomical terms, etc.) to add to the stopwords.
"""
import sys
import json
from pathlib import Path
from collections import Counter
import re


def extract_medical_terms():
    """Extract medical-term candidates from NOM_EXTRACTED audit detections.

    Reads every ``*.audit.jsonl`` produced by the baseline anonymization run,
    counts NOM_EXTRACTED terms, flags likely medical vocabulary with regex
    patterns, then writes a JSON report and a ready-to-paste Python stopword
    set under ``tests/ground_truth/analysis/``.

    Returns:
        int: 0 on success, 1 when no audit file is found (exit code).
    """

    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    audit_files = sorted(baseline_dir.glob("*.audit.jsonl"))

    if not audit_files:
        print(f"✗ Aucun fichier audit trouvé dans {baseline_dir}")
        return 1

    print("="*80)
    print("EXTRACTION DES TERMES MÉDICAUX")
    print("="*80)
    print(f"\n📁 Analyse de {len(audit_files)} fichiers audit...")

    # Collect every NOM_EXTRACTED detection, lower-cased for counting.
    nom_extracted = []

    for audit_file in audit_files:
        with open(audit_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    det = json.loads(line)
                    if det.get('kind') == 'NOM_EXTRACTED':
                        text = det.get('original', '').strip()
                        if text:
                            nom_extracted.append(text.lower())

    print(f"\n📊 {len(nom_extracted)} détections NOM_EXTRACTED trouvées")

    # Count occurrences per unique term.
    counter = Counter(nom_extracted)

    print(f"📊 {len(counter)} termes uniques")

    # Identify potential medical terms.
    # Criteria: frequent terms (>= 3 occurrences) matching medical patterns.
    medical_patterns = [
        r'^(dr|docteur|professeur|prof)$',
        r'^(service|unite|unité|departement|département)$',
        r'^(hopital|hôpital|clinique|centre|chu|ch)$',
        r'^(medecin|médecin|infirmier|infirmière|aide.soignant)$',
        r'^(consultation|hospitalisation|urgence|urgences)$',
        r'^(chirurgie|cardiologie|pneumologie|neurologie|oncologie)$',
        r'^(radiologie|imagerie|scanner|irm|echographie|échographie)$',
        r'^(biologie|laboratoire|analyse|prelevement|prélèvement)$',
        r'^(traitement|medicament|médicament|ordonnance|prescription)$',
        r'^(diagnostic|pathologie|maladie|syndrome|infection)$',
        r'^(patient|malade|sujet|cas)$',
        r'^(examen|bilan|controle|contrôle|suivi)$',
        r'^(resultat|résultat|valeur|taux|dosage)$',
        r'^(antecedent|antécédent|allergie|risque|facteur)$',
        r'^(gauche|droit|droite|superieur|supérieur|inferieur|inférieur)$',
        r'^(anterieur|antérieur|posterieur|postérieur|lateral|latéral)$',
        r'^(proximal|distal|medial|médial)$',
        r'(ologie|ique|aire|ose|ite|ome)$',  # medical suffixes
    ]
    # Hoisted: compile once instead of re-matching raw patterns per term.
    compiled_patterns = [re.compile(p, re.IGNORECASE) for p in medical_patterns]

    medical_terms = set()
    frequent_terms = []

    for term, count in counter.most_common():
        # Frequent terms (>= 3 occurrences).
        if count >= 3:
            frequent_terms.append((term, count))

            # Flag the term as medical when any pattern matches.
            if any(p.search(term) for p in compiled_patterns):
                medical_terms.add(term)

    print(f"\n📊 {len(frequent_terms)} termes fréquents (≥3 occurrences)")
    print(f"📊 {len(medical_terms)} termes médicaux identifiés automatiquement")

    # Show the 50 most frequent terms.
    print("\n🔝 Top 50 termes les plus fréquents:")
    for i, (term, count) in enumerate(frequent_terms[:50], 1):
        is_medical = term in medical_terms
        marker = "🏥" if is_medical else " "
        print(f" {i:2d}. {marker} {term:30s} ({count:3d} occurrences)")

    # Bucket the medical terms into broad categories.
    categories = {
        "Titres/Fonctions": [],
        "Services/Départements": [],
        "Établissements": [],
        "Examens/Procédures": [],
        "Anatomie": [],
        "Pathologies": [],
        "Médicaments/Traitements": [],
        "Termes généraux": []
    }

    # Per-category patterns. Bug fix: short tokens ("dr", "prof", "ch", "chu")
    # are now anchored to a full match — the previous unanchored r'dr' also
    # matched "droit"/"droite", mis-filing anatomical terms under
    # "Titres/Fonctions" instead of "Anatomie".
    category_patterns = {
        "Titres/Fonctions": [r'^(dr|prof)$|docteur|professeur|medecin|médecin|infirmier'],
        "Services/Départements": [r'(service|unite|unité|departement|département|ologie)'],
        "Établissements": [r'hopital|hôpital|clinique|centre|^(chu|ch)$'],
        "Examens/Procédures": [r'(examen|bilan|scanner|irm|echo|radio|analyse)'],
        "Anatomie": [r'(gauche|droit|superieur|inferieur|anterieur|posterieur|lateral|proximal|distal)'],
        "Pathologies": [r'(ite|ose|ome|pathologie|maladie|syndrome|infection)'],
        "Médicaments/Traitements": [r'(traitement|medicament|médicament|ordonnance|prescription)'],
    }

    for term in medical_terms:
        categorized = False
        for category, patterns in category_patterns.items():
            if any(re.search(p, term, re.IGNORECASE) for p in patterns):
                categories[category].append(term)
                categorized = True
                break
        if not categorized:
            # Fallback bucket for terms matching no specific category.
            categories["Termes généraux"].append(term)

    # Print the per-category breakdown.
    print("\n" + "="*80)
    print("TERMES MÉDICAUX PAR CATÉGORIE")
    print("="*80)

    for category, terms in categories.items():
        if terms:
            print(f"\n{category} ({len(terms)} termes):")
            for term in sorted(terms)[:20]:  # cap at 20 per category
                count = counter[term]
                print(f" - {term} ({count} occ.)")
            if len(terms) > 20:
                print(f" ... et {len(terms) - 20} autres")

    # Persist the results.
    output_dir = Path("tests/ground_truth/analysis")
    output_dir.mkdir(exist_ok=True)

    output_data = {
        "extraction_date": "2026-03-02",
        "total_detections": len(nom_extracted),
        "unique_terms": len(counter),
        "frequent_terms_count": len(frequent_terms),
        "medical_terms_count": len(medical_terms),
        "top_50_frequent": [
            {"term": term, "count": count, "is_medical": term in medical_terms}
            for term, count in frequent_terms[:50]
        ],
        "medical_terms_by_category": {
            category: sorted(terms)
            for category, terms in categories.items()
            if terms
        },
        "all_medical_terms": sorted(medical_terms)
    }

    output_file = output_dir / "medical_stopwords_candidates.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {output_file}")

    # Generate a Python file with the new stopwords, ready to paste.
    stopwords_file = output_dir / "new_medical_stopwords.py"
    with open(stopwords_file, 'w', encoding='utf-8') as f:
        f.write("# Nouveaux stopwords médicaux extraits automatiquement\n")
        f.write("# À ajouter à _MEDICAL_STOP_WORDS_SET dans anonymizer_core_refactored_onnx.py\n\n")
        f.write("NEW_MEDICAL_STOPWORDS = {\n")
        for term in sorted(medical_terms):
            f.write(f'    "{term}",\n')
        f.write("}\n")

    print(f"📊 Stopwords Python générés: {stopwords_file}")

    print("\n" + "="*80)
    print("RECOMMANDATIONS")
    print("="*80)
    print(f"""
1. Réviser manuellement les termes dans: {output_file}
2. Ajouter les termes validés à _MEDICAL_STOP_WORDS_SET
3. Re-exécuter l'anonymisation et l'évaluation
4. Vérifier la réduction des FP NOM_EXTRACTED

Gain estimé: Réduction de ~{len(medical_terms)} termes récurrents
Impact: Amélioration de la précision de plusieurs points
""")

    return 0


if __name__ == "__main__":
    sys.exit(extract_medical_terms())
|
||||
Reference in New Issue
Block a user