anonymisation/tests/ground_truth/analysis/baseline_analysis.json

{
  "analysis_date": "2026-03-02",
  "global_metrics": {
    "precision": 0.1897,
    "recall": 1.0,
    "f1_score": 0.3189,
    "true_positives": 1159,
    "false_positives": 4951,
    "false_negatives": 0
  },
  "problems": [
    {
      "priority": "HAUTE",
      "category": "Propagation globale",
      "description": "951 faux positifs dus aux détections *_GLOBAL",
      "types": [
        "NOM_GLOBAL",
        "ETAB_GLOBAL",
        "TEL_GLOBAL",
        "ADRESSE_GLOBAL",
        "CODE_POSTAL_GLOBAL",
        "DATE_NAISSANCE_GLOBAL",
        "EMAIL_GLOBAL",
        "RPPS_GLOBAL",
        "EPISODE_GLOBAL",
        "VILLE_GLOBAL"
      ],
      "impact": "19.2% des FP totaux",
      "solution": "Améliorer la logique de propagation globale ou désactiver pour certains types"
    },
    {
      "priority": "HAUTE",
      "category": "Extraction de noms",
      "description": "3846 faux positifs de type NOM_EXTRACTED",
      "types": [
        "NOM_EXTRACTED"
      ],
      "impact": "77.7% des FP totaux",
      "solution": "Améliorer les stopwords médicaux et la détection contextuelle"
    },
    {
      "priority": "MOYENNE",
      "category": "Précision faible",
      "description": "10 types avec précision < 50%",
      "types": [
        "NOM_EXTRACTED",
        "NOM_GLOBAL",
        "ETAB_GLOBAL",
        "TEL_GLOBAL",
        "ADRESSE_GLOBAL",
        "CODE_POSTAL_GLOBAL",
        "DATE_NAISSANCE_GLOBAL",
        "EMAIL_GLOBAL",
        "EPISODE",
        "VILLE"
      ],
      "impact": "Affecte 4897 FP",
      "solution": "Améliorer les regex et la détection contextuelle pour ces types"
    }
  ],
  "improvements": [
    {
      "priority": 2,
      "title": "Enrichir les stopwords médicaux",
      "impact": "Réduction de ~3846 FP NOM_EXTRACTED",
      "effort": "Faible",
      "gain_precision": "+32.2 points",
      "tasks": [
        "Extraire les termes médicaux des documents annotés",
        "Identifier les faux positifs récurrents",
        "Ajouter à _MEDICAL_STOP_WORDS_SET"
      ]
    },
    {
      "priority": 4,
      "title": "Implémenter la détection contextuelle",
      "impact": "Réduction de ~126 FP",
      "effort": "Élevé",
      "gain_precision": "+0.4 points",
      "tasks": [
        "Créer detectors/contextual.py",
        "Implémenter la détection avec contexte fort/faible",
        "Filtrer via stopwords médicaux",
        "Intégrer dans le pipeline hybride"
      ]
    }
  ],
  "false_positives_by_type": {
    "NOM_EXTRACTED": 3846,
    "NOM_GLOBAL": 670,
    "EPISODE": 106,
    "TEL_GLOBAL": 77,
    "ADRESSE_GLOBAL": 55,
    "CODE_POSTAL_GLOBAL": 39,
    "ETAB_GLOBAL": 36,
    "EMAIL_GLOBAL": 28,
    "DATE_NAISSANCE_GLOBAL": 20,
    "VILLE": 20,
    "ADRESSE": 10,
    "CODE_POSTAL": 10,
    "VILLE_GLOBAL": 10,
    "EPISODE_GLOBAL": 9,
    "TEL": 8,
    "RPPS_GLOBAL": 7
  },
  "low_precision_types": [
    {
      "type": "NOM_EXTRACTED",
      "precision": 0.0,
      "fp": 3846
    },
    {
      "type": "NOM_GLOBAL",
      "precision": 0.0,
      "fp": 670
    },
    {
      "type": "ETAB_GLOBAL",
      "precision": 0.0,
      "fp": 36
    },
    {
      "type": "TEL_GLOBAL",
      "precision": 0.0,
      "fp": 77
    },
    {
      "type": "ADRESSE_GLOBAL",
      "precision": 0.0,
      "fp": 55
    },
    {
      "type": "CODE_POSTAL_GLOBAL",
      "precision": 0.0,
      "fp": 39
    },
    {
      "type": "DATE_NAISSANCE_GLOBAL",
      "precision": 0.0,
      "fp": 20
    },
    {
      "type": "EMAIL_GLOBAL",
      "precision": 0.0,
      "fp": 28
    },
    {
      "type": "EPISODE",
      "precision": 0.1452,
      "fp": 106
    },
    {
      "type": "VILLE",
      "precision": 0.2,
      "fp": 20
    }
  ]
}