feat(phase3): CamemBERT v3 + détection villes + initiales + texte espacé + docs réglementaires

Intégration du modèle CamemBERT-bio-deid v3 (F1=0.96, Recall=0.97, 1112 docs)
et corrections qualité issues de l'audit approfondi sur 29 fichiers.

Détection des villes en texte libre :
- Automate Aho-Corasick sur 33K communes INSEE + 11.6K villes FINESS
- Stratégie contextuelle : exige un contexte géographique (à, de, vers,
  habite, urgences de, etc.) sauf pour les villes composées (ex. Saint-Palais)
- Blacklist de ~80 communes homonymes de mots courants (charge, signes, plan...)
- Normalisation SAINT↔ST pour les variantes orthographiques
- Résultat : passage de 18 fuites de villes à 2 cas résiduels atypiques

Masquage des initiales de prénom :
- Post-traitement regex : "Dr T. [NOM]" → "Dr [NOM] [NOM]"
- Références initiales : "Ref : JF/VA" → "Ref : [NOM]/[NOM]"

Détection texte espacé d'en-tête :
- "C E N T R E  H O S P I T A L I E R" → [ETABLISSEMENT]

Autres corrections :
- Fix regex RE_EXTRACT_MME_MR (Mr?.? → Mr.?, \s+ → [ \t]+, * → {0,4})
- Stop words médicaux : lever, coucher, services hospitaliers (viscérale, etc.)
- CamemBERT NER manager : version tracking, propriété version, log F1/Recall
- Script finetune : export ONNX automatique + mise à jour VERSION.json
- Évaluateur qualité : exclusion stop words médicaux des alertes INSEE

Documentation :
- Spécifications techniques CamemBERT-bio-deid v3
- Conformité RGPD + AI Act (caviardage PDF raster)
- AIPD (Analyse d'Impact relative à la Protection des Données)

Score qualité : 97.0/100 (Grade A), Leak score 100/100

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-11 12:16:13 +01:00
parent c9572c383a
commit eb14cd219d
8 changed files with 1957 additions and 9 deletions

View File

@@ -0,0 +1,254 @@
{
"date": "2026-03-11T12:11:24.286697",
"scores": {
"global_score": 97.0,
"leak_score": 100.0,
"fp_score": 90,
"totals": {
"documents": 29,
"audit_hits": 2804,
"name_tokens_known": 461,
"leak_audit": 0,
"leak_occurrences": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 568,
"fp_medical": 0,
"fp_overmasking": 2
}
},
"per_file": {
"BACTERIO 23232115": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 3,
"fp_medical": 0,
"fp_overmasking": 0
},
"CONSULTATION ANESTHESISTE 23056022": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 11,
"fp_medical": 0,
"fp_overmasking": 0
},
"CONSULTATION ANESTHESISTE 23060661": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 6,
"fp_medical": 0,
"fp_overmasking": 0
},
"CONSULTATION ANESTHESISTE 23139653": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 6,
"fp_medical": 0,
"fp_overmasking": 0
},
"CRH 60_23106634": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 5,
"fp_medical": 0,
"fp_overmasking": 1
},
"CRO 23159905": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 5,
"fp_medical": 0,
"fp_overmasking": 1
},
"CRO 23160703": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 2,
"fp_medical": 0,
"fp_overmasking": 0
},
"LETTRE DE SORTIE 23087212": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 0,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-00260974-23070213_00260974_23070213": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 29,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-03020576-23175616_03020576_23175616": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 31,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-05000272-23074376_05000272_23074376": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 11,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-05012679-23098722_05012679_23098722": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 23,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-05012965-23060770_05012965_23060770": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 31,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-07003136-23135847_07003136_23135847": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 35,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-11004431-23124019_11004431_23124019": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 20,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-13013848-23165708_13013848_23165708": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 17,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-14025311-23034958_14025311_23034958": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 12,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-17015185-23043950_17015185_23043950": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 18,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-23000862-23018396_23000862_23018396": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 32,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-99246761-23159905_99246761_23159905": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 34,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-99252128-23177582_99252128_23177582": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 33,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA042686-23090597_BA042686_23090597": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 23,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA065989-23102874_BA065989_23102874": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 11,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA067657-23076655_BA067657_23076655": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 32,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA093659-23074520_BA093659_23074520": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 30,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA121804-23016863_BA121804_23016863": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 34,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA127127-23135726_BA127127_23135726": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 26,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA171849-23214501_BA171849_23214501": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 22,
"fp_medical": 0,
"fp_overmasking": 0
},
"trackare-BA192486-23127395_BA192486_23127395": {
"leak_audit": 0,
"leak_regex": 0,
"leak_insee_high": 0,
"leak_insee_medium": 26,
"fp_medical": 0,
"fp_overmasking": 0
}
}
}