Files
anonymisation/tests/ground_truth/analysis/baseline_analysis.json

158 lines
3.7 KiB
JSON

{
"analysis_date": "2026-03-02",
"global_metrics": {
"precision": 0.1897,
"recall": 1.0,
"f1_score": 0.3189,
"true_positives": 1159,
"false_positives": 4951,
"false_negatives": 0
},
"problems": [
{
"priority": "HAUTE",
"category": "Propagation globale",
"description": "951 faux positifs dus aux détections *_GLOBAL",
"types": [
"NOM_GLOBAL",
"ETAB_GLOBAL",
"TEL_GLOBAL",
"ADRESSE_GLOBAL",
"CODE_POSTAL_GLOBAL",
"DATE_NAISSANCE_GLOBAL",
"EMAIL_GLOBAL",
"RPPS_GLOBAL",
"EPISODE_GLOBAL",
"VILLE_GLOBAL"
],
"impact": "19.2% des FP totaux",
"solution": "Améliorer la logique de propagation globale ou désactiver pour certains types"
},
{
"priority": "HAUTE",
"category": "Extraction de noms",
"description": "3846 faux positifs de type NOM_EXTRACTED",
"types": [
"NOM_EXTRACTED"
],
"impact": "77.7% des FP totaux",
"solution": "Améliorer les stopwords médicaux et la détection contextuelle"
},
{
"priority": "MOYENNE",
"category": "Précision faible",
"description": "10 types avec précision < 50%",
"types": [
"NOM_EXTRACTED",
"NOM_GLOBAL",
"ETAB_GLOBAL",
"TEL_GLOBAL",
"ADRESSE_GLOBAL",
"CODE_POSTAL_GLOBAL",
"DATE_NAISSANCE_GLOBAL",
"EMAIL_GLOBAL",
"EPISODE",
"VILLE"
],
"impact": "Affecte 4897 FP",
"solution": "Améliorer les regex et la détection contextuelle pour ces types"
}
],
"improvements": [
{
"priority": 2,
"title": "Enrichir les stopwords médicaux",
"impact": "Réduction de ~3846 FP NOM_EXTRACTED",
"effort": "Faible",
"gain_precision": "+32.2 points",
"tasks": [
"Extraire les termes médicaux des documents annotés",
"Identifier les faux positifs récurrents",
"Ajouter à _MEDICAL_STOP_WORDS_SET"
]
},
{
"priority": 4,
"title": "Implémenter la détection contextuelle",
"impact": "Réduction de ~126 FP",
"effort": "Élevé",
"gain_precision": "+0.4 points",
"tasks": [
"Créer detectors/contextual.py",
"Implémenter la détection avec contexte fort/faible",
"Filtrer via stopwords médicaux",
"Intégrer dans le pipeline hybride"
]
}
],
"false_positives_by_type": {
"NOM_EXTRACTED": 3846,
"NOM_GLOBAL": 670,
"EPISODE": 106,
"TEL_GLOBAL": 77,
"ADRESSE_GLOBAL": 55,
"CODE_POSTAL_GLOBAL": 39,
"ETAB_GLOBAL": 36,
"EMAIL_GLOBAL": 28,
"DATE_NAISSANCE_GLOBAL": 20,
"VILLE": 20,
"ADRESSE": 10,
"CODE_POSTAL": 10,
"VILLE_GLOBAL": 10,
"EPISODE_GLOBAL": 9,
"TEL": 8,
"RPPS_GLOBAL": 7
},
"low_precision_types": [
{
"type": "NOM_EXTRACTED",
"precision": 0.0,
"fp": 3846
},
{
"type": "NOM_GLOBAL",
"precision": 0.0,
"fp": 670
},
{
"type": "ETAB_GLOBAL",
"precision": 0.0,
"fp": 36
},
{
"type": "TEL_GLOBAL",
"precision": 0.0,
"fp": 77
},
{
"type": "ADRESSE_GLOBAL",
"precision": 0.0,
"fp": 55
},
{
"type": "CODE_POSTAL_GLOBAL",
"precision": 0.0,
"fp": 39
},
{
"type": "DATE_NAISSANCE_GLOBAL",
"precision": 0.0,
"fp": 20
},
{
"type": "EMAIL_GLOBAL",
"precision": 0.0,
"fp": 28
},
{
"type": "EPISODE",
"precision": 0.1452,
"fp": 106
},
{
"type": "VILLE",
"precision": 0.2,
"fp": 20
}
]
}