tests: alias DLBCL + garde-fou Trackare + e2e PDFs réels + gold CRH + benchmark enrichi

- 11 tests unitaires : TestAliasAndConclusionBonus (7) + TestTrackareSymptomGuard (4)
- Tests e2e sur PDFs réels (skip si absent) : méningite A87.0 + DLBCL C83.3 top1
- Gold CRH enrichi : 5 cas (2 réels ajoutés : 115_23066188, 132_23080179)
- Benchmark synthese : récupération conclusion depuis source_excerpt des DAS/traitements
- .gitignore : protection anti-PHI (real_crh_pdfs/, data/crh_samples/*.pdf)
- docs/PHI_POLICY.md : 7 règles de sécurité PHI
- Rapports debug : case 132 REVIEW (garde-fou actif), top errors, DIM pack

1043 tests passent, 0 régression.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-24 14:35:57 +01:00
parent 06a1be5425
commit cad0dd22b1
16 changed files with 1513 additions and 11 deletions

5
.gitignore vendored
View File

@@ -16,6 +16,11 @@ data/
*.xls
*.xlsx
# PDFs réels (PHI potentiel — JAMAIS committer)
real_crh_pdfs/
data/crh_samples/*.pdf
tests/resources/real_crh/*.pdf
# Configuration locale
.env

View File

@@ -0,0 +1,5 @@
{"case_id": "106_23056475", "document_type": "crh", "dp_expected": {"code": "I26.9", "label": "Embolie pulmonaire"}, "dp_acceptable_codes": ["I26.0", "I26.9"], "dp_acceptable_family3": ["I26"], "allow_symptom_dp": false, "confidence": "certain", "evidence": [{"section": "Conclusion", "excerpt": "Embolie pulmonaire confirmée au scanner"}], "notes": "Candidat clair dans le dossier"}
{"case_id": "74_23141536", "document_type": "crh", "dp_expected": {"code": "I25.1", "label": "Syndrome coronarien aigu"}, "dp_acceptable_codes": ["I25.1", "I25.5"], "dp_acceptable_family3": ["I25"], "allow_symptom_dp": false, "confidence": "probable", "evidence": [{"section": "Conclusion", "excerpt": "SCA diagnostiqué avec troponine élevée"}], "notes": "Ambiguïté anémie vs SCA"}
{"case_id": "73_23139637", "document_type": "crh", "dp_expected": {"code": "R06.0", "label": "Dyspnée"}, "dp_acceptable_codes": [], "dp_acceptable_family3": ["R06"], "allow_symptom_dp": true, "confidence": "ambiguous", "evidence": [{"section": "Motif", "excerpt": "Dyspnée aiguë sans étiologie retrouvée"}], "notes": "Symptôme seul défendable en DP"}
{"case_id": "115_23066188", "document_type": "crh", "dp_expected": {"code": "A87.0", "label": "Méningite à entérovirus"}, "dp_acceptable_codes": [], "dp_acceptable_family3": ["A87"], "allow_symptom_dp": false, "confidence": "probable", "evidence": [{"section": "contenu_medical", "excerpt": "Synthèse du séjour : méningite à Entérovirus d'évolution favorable après antalgie par paracétamol, Nubain"}], "notes": "CRH pédiatrie — DP clair sans ambiguïté"}
{"case_id": "132_23080179", "document_type": "crh", "dp_expected": {"code": "C83.3", "label": "Lymphome diffus à grandes cellules B"}, "dp_acceptable_codes": [], "dp_acceptable_family3": ["C83"], "allow_symptom_dp": false, "confidence": "probable", "evidence": [{"section": "conclusion", "excerpt": "Initiation d'un traitement protocolaire VALYM pour un DLBCL en progression après 2 lignes de traitement"}, {"section": "histoire_maladie", "excerpt": "Adénopathies cervicales bilatérales et axillaires bilatérales"}], "notes": "Pipeline code R59.0 (symptôme) — le vrai DP est C83.3 (lymphome confirmé + chimio VALYM)"}

View File

@@ -0,0 +1,3 @@
case_id,dp_expected_code,dp_expected_label,dp_acceptable_codes,dp_acceptable_family3,allow_symptom_dp,confidence,evidence_1_section,evidence_1_excerpt,evidence_2_section,evidence_2_excerpt,notes
115_23066188,A87.0,Méningite à entérovirus,,A87,false,probable,contenu_medical,"Synthèse du séjour : méningite à Entérovirus d'évolution favorable après antalgie par paracétamol, Nubain",,,CRH pédiatrie — DP clair sans ambiguïté
132_23080179,C83.3,Lymphome diffus à grandes cellules B,,C83,false,probable,conclusion,"Initiation d'un traitement protocolaire VALYM pour un DLBCL en progression après 2 lignes de traitement",histoire_maladie,"Adénopathies cervicales bilatérales et axillaires bilatérales",Pipeline code R59.0 (symptôme) — le vrai DP est C83.3 (lymphome confirmé + chimio VALYM)
1 case_id dp_expected_code dp_expected_label dp_acceptable_codes dp_acceptable_family3 allow_symptom_dp confidence evidence_1_section evidence_1_excerpt evidence_2_section evidence_2_excerpt notes
2 115_23066188 A87.0 Méningite à entérovirus A87 false probable contenu_medical Synthèse du séjour : méningite à Entérovirus d'évolution favorable après antalgie par paracétamol, Nubain CRH pédiatrie — DP clair sans ambiguïté
3 132_23080179 C83.3 Lymphome diffus à grandes cellules B C83 false probable conclusion Initiation d'un traitement protocolaire VALYM pour un DLBCL en progression après 2 lignes de traitement histoire_maladie Adénopathies cervicales bilatérales et axillaires bilatérales Pipeline code R59.0 (symptôme) — le vrai DP est C83.3 (lymphome confirmé + chimio VALYM)

View File

@@ -0,0 +1,2 @@
{"case_id": "115_23066188", "document_type": "crh", "dp_expected": {"code": "A87.0", "label": "Méningite à entérovirus"}, "dp_acceptable_codes": [], "dp_acceptable_family3": ["A87"], "allow_symptom_dp": false, "confidence": "probable", "evidence": [{"section": "contenu_medical", "excerpt": "Synthèse du séjour : méningite à Entérovirus d'évolution favorable après antalgie par paracétamol, Nubain"}], "notes": "CRH pédiatrie — DP clair sans ambiguïté"}
{"case_id": "132_23080179", "document_type": "crh", "dp_expected": {"code": "C83.3", "label": "Lymphome diffus à grandes cellules B"}, "dp_acceptable_codes": [], "dp_acceptable_family3": ["C83"], "allow_symptom_dp": false, "confidence": "probable", "evidence": [{"section": "conclusion", "excerpt": "Initiation d'un traitement protocolaire VALYM pour un DLBCL en progression après 2 lignes de traitement"}, {"section": "histoire_maladie", "excerpt": "Adénopathies cervicales bilatérales et axillaires bilatérales"}], "notes": "Pipeline code R59.0 (symptôme) — le vrai DP est C83.3 (lymphome confirmé + chimio VALYM)"}

9
docs/PHI_POLICY.md Normal file
View File

@@ -0,0 +1,9 @@
# Politique PHI — Données de santé identifiantes
1. Les PDF de CRH réels ne sont JAMAIS committés dans le repo.
2. Le dossier `real_crh_pdfs/` est exclu par `.gitignore`.
3. Seuls des extraits texte courts (<=240 chars), pseudonymisés, peuvent figurer dans les tests.
4. Les entrées gold (JSONL/CSV) ne contiennent que des codes CIM-10 et des extraits anonymisés.
5. Les tests e2e sur PDFs réels sont skippés automatiquement si les fichiers sont absents.
6. Avant tout partage du repo : vérifier `git status` — aucun `.pdf` ne doit apparaître.
7. En cas de doute, exécuter : `git diff --cached --name-only | grep -i '\.pdf$'`

View File

@@ -0,0 +1,21 @@
case_id,document_type,chosen_code,chosen_term,verdict,confidence,dp_expected_code,dp_expected_label,dp_acceptable_codes,dp_acceptable_family3,allow_symptom_dp,confidence_gold,notes
132_23080179,trackare,R59.0,Adénopathie,REVIEW,medium,C83.3,Lymphome diffus à grandes cellules B,,C83,False,probable,
74_23141536,crh,D50,Anémie,REVIEW,medium,I25.1,Syndrome coronarien aigu,I25.1|I25.5,I25,False,probable,
99_23033146,trackare,E66.83,Obésité (IMC 30.408),REVIEW,medium,,,,,,,
106_23056475,trackare,I26.9,Embolie pulmonaire,REVIEW,medium,I26.9,Embolie pulmonaire,I26.0|I26.9,I26,False,certain,
111_23061304,trackare,N19,Insuffisance rénale,REVIEW,medium,,,,,,,
112_23065936,trackare,I25.5,Cardiopathie ischémique,REVIEW,medium,,,,,,,
120_23033508,trackare,N85.7,Hématome,REVIEW,medium,,,,,,,
139_23087691,trackare,M16.7,Coxarthrose,REVIEW,medium,,,,,,,
140_23090475,trackare,Z54.8,Convalescence,REVIEW,medium,,,,,,,
149_23089771,trackare,H16.0,C omprend décollement de la (de la) : • conjonctive,REVIEW,medium,,,,,,,
153_23102610,trackare,T83.5,Infection urinaire,REVIEW,medium,,,,,,,
159_23107113,trackare,I26.9,Embolie pulmonaire,REVIEW,medium,,,,,,,
160_23099448,trackare,E88.1,Lipodystrophie,REVIEW,medium,,,,,,,
170_23077016,trackare,K59.0,Constipation,REVIEW,medium,,,,,,,
174_23080042,trackare,Q40.1,Hernie hiatale ce,REVIEW,medium,,,,,,,
183_23087212,trackare,T83.5,Infection urinaire,REVIEW,medium,,,,,,,
192_23132490,trackare,D50,Anémie,REVIEW,medium,,,,,,,
200_23149959,trackare,I80.2,Thrombose veineuse profonde,REVIEW,medium,,,,,,,
225_23160703,trackare,N85.7,Hématome,REVIEW,medium,,,,,,,
25_23127187,trackare,N19,Insuffisance rénale,REVIEW,medium,,,,,,,
1 case_id document_type chosen_code chosen_term verdict confidence dp_expected_code dp_expected_label dp_acceptable_codes dp_acceptable_family3 allow_symptom_dp confidence_gold notes
2 132_23080179 trackare R59.0 Adénopathie REVIEW medium C83.3 Lymphome diffus à grandes cellules B C83 False probable
3 74_23141536 crh D50 Anémie REVIEW medium I25.1 Syndrome coronarien aigu I25.1|I25.5 I25 False probable
4 99_23033146 trackare E66.83 Obésité (IMC 30.408) REVIEW medium
5 106_23056475 trackare I26.9 Embolie pulmonaire REVIEW medium I26.9 Embolie pulmonaire I26.0|I26.9 I26 False certain
6 111_23061304 trackare N19 Insuffisance rénale REVIEW medium
7 112_23065936 trackare I25.5 Cardiopathie ischémique REVIEW medium
8 120_23033508 trackare N85.7 Hématome REVIEW medium
9 139_23087691 trackare M16.7 Coxarthrose REVIEW medium
10 140_23090475 trackare Z54.8 Convalescence REVIEW medium
11 149_23089771 trackare H16.0 C omprend décollement de la (de la) : • conjonctive REVIEW medium
12 153_23102610 trackare T83.5 Infection urinaire REVIEW medium
13 159_23107113 trackare I26.9 Embolie pulmonaire REVIEW medium
14 160_23099448 trackare E88.1 Lipodystrophie REVIEW medium
15 170_23077016 trackare K59.0 Constipation REVIEW medium
16 174_23080042 trackare Q40.1 Hernie hiatale ce REVIEW medium
17 183_23087212 trackare T83.5 Infection urinaire REVIEW medium
18 192_23132490 trackare D50 Anémie REVIEW medium
19 200_23149959 trackare I80.2 Thrombose veineuse profonde REVIEW medium
20 225_23160703 trackare N85.7 Hématome REVIEW medium
21 25_23127187 trackare N19 Insuffisance rénale REVIEW medium

View File

@@ -0,0 +1,6 @@
case_id,document_type,chosen_code,chosen_term,verdict,confidence,expected_code,acceptable_codes,acceptable_family3,strict_match,acceptable_match,family3_match,symptom_not_allowed,raw_pool_size,filtered_pool_size,topk_size,evidence_count,review_reason_tag,top1_score,top2_score,delta_top1_top2,top3_codes,top3_terms
132_23080179,trackare,R59.0,Adénopathie,REVIEW,medium,C83.3,,C83,False,False,False,True,23,0,0,2,other,0,0,0,,
74_23141536,crh,D50,Anémie,REVIEW,medium,I25.1,I25.1|I25.5,I25,False,False,False,False,3,3,3,1,low_delta,4.0,4.0,0.0,D50|I25.1|Z95.5,Anémie|SCA (Syndrome Coronarien Aigu)|Stent vasculaire
115_23066188,trackare,A87.0,Méningite à entérovirus,CONFIRMED,high,A87.0,,A87,True,True,True,False,6,0,0,1,other,0,0,0,,
106_23056475,trackare,I26.9,Embolie pulmonaire,REVIEW,medium,I26.9,I26.0|I26.9,I26,True,True,True,False,10,7,7,1,low_delta,6.0,5.0,1.0,I26.9|I26.9|Q53.9,Embolie pulmonaire|Embolie pulmonaire|Cryptorchidie
73_23139637,trackare,R06.0,Dyspnée,REVIEW,medium,R06.0,,R06,True,True,True,False,1,1,1,1,mono_fragile,1.0,0,1.0,R06.0,Dyspnée
1 case_id document_type chosen_code chosen_term verdict confidence expected_code acceptable_codes acceptable_family3 strict_match acceptable_match family3_match symptom_not_allowed raw_pool_size filtered_pool_size topk_size evidence_count review_reason_tag top1_score top2_score delta_top1_top2 top3_codes top3_terms
2 132_23080179 trackare R59.0 Adénopathie REVIEW medium C83.3 C83 False False False True 23 0 0 2 other 0 0 0
3 74_23141536 crh D50 Anémie REVIEW medium I25.1 I25.1|I25.5 I25 False False False False 3 3 3 1 low_delta 4.0 4.0 0.0 D50|I25.1|Z95.5 Anémie|SCA (Syndrome Coronarien Aigu)|Stent vasculaire
4 115_23066188 trackare A87.0 Méningite à entérovirus CONFIRMED high A87.0 A87 True True True False 6 0 0 1 other 0 0 0
5 106_23056475 trackare I26.9 Embolie pulmonaire REVIEW medium I26.9 I26.0|I26.9 I26 True True True False 10 7 7 1 low_delta 6.0 5.0 1.0 I26.9|I26.9|Q53.9 Embolie pulmonaire|Embolie pulmonaire|Cryptorchidie
6 73_23139637 trackare R06.0 Dyspnée REVIEW medium R06.0 R06 True True True False 1 1 1 1 mono_fragile 1.0 0 1.0 R06.0 Dyspnée

View File

@@ -0,0 +1,5 @@
{"case_id": "132_23080179", "document_type": "trackare", "chosen_code": "R59.0", "chosen_term": "Adénopathie", "verdict": "REVIEW", "confidence": "medium", "expected_code": "C83.3", "acceptable_codes": "", "acceptable_family3": "C83", "strict_match": false, "acceptable_match": false, "family3_match": false, "symptom_not_allowed": true, "raw_pool_size": 23, "filtered_pool_size": 0, "topk_size": 0, "evidence_count": 2, "review_reason_tag": "other", "top1_score": 0, "top2_score": 0, "delta_top1_top2": 0, "top3_codes": "", "top3_terms": ""}
{"case_id": "74_23141536", "document_type": "crh", "chosen_code": "D50", "chosen_term": "Anémie", "verdict": "REVIEW", "confidence": "medium", "expected_code": "I25.1", "acceptable_codes": "I25.1|I25.5", "acceptable_family3": "I25", "strict_match": false, "acceptable_match": false, "family3_match": false, "symptom_not_allowed": false, "raw_pool_size": 3, "filtered_pool_size": 3, "topk_size": 3, "evidence_count": 1, "review_reason_tag": "low_delta", "top1_score": 4.0, "top2_score": 4.0, "delta_top1_top2": 0.0, "top3_codes": "D50|I25.1|Z95.5", "top3_terms": "Anémie|SCA (Syndrome Coronarien Aigu)|Stent vasculaire"}
{"case_id": "115_23066188", "document_type": "trackare", "chosen_code": "A87.0", "chosen_term": "Méningite à entérovirus", "verdict": "CONFIRMED", "confidence": "high", "expected_code": "A87.0", "acceptable_codes": "", "acceptable_family3": "A87", "strict_match": true, "acceptable_match": true, "family3_match": true, "symptom_not_allowed": false, "raw_pool_size": 6, "filtered_pool_size": 0, "topk_size": 0, "evidence_count": 1, "review_reason_tag": "other", "top1_score": 0, "top2_score": 0, "delta_top1_top2": 0, "top3_codes": "", "top3_terms": ""}
{"case_id": "106_23056475", "document_type": "trackare", "chosen_code": "I26.9", "chosen_term": "Embolie pulmonaire", "verdict": "REVIEW", "confidence": "medium", "expected_code": "I26.9", "acceptable_codes": "I26.0|I26.9", "acceptable_family3": "I26", "strict_match": true, "acceptable_match": true, "family3_match": true, "symptom_not_allowed": false, "raw_pool_size": 10, "filtered_pool_size": 7, "topk_size": 7, "evidence_count": 1, "review_reason_tag": "low_delta", "top1_score": 6.0, "top2_score": 5.0, "delta_top1_top2": 1.0, "top3_codes": "I26.9|I26.9|Q53.9", "top3_terms": "Embolie pulmonaire|Embolie pulmonaire|Cryptorchidie"}
{"case_id": "73_23139637", "document_type": "trackare", "chosen_code": "R06.0", "chosen_term": "Dyspnée", "verdict": "REVIEW", "confidence": "medium", "expected_code": "R06.0", "acceptable_codes": "", "acceptable_family3": "R06", "strict_match": true, "acceptable_match": true, "family3_match": true, "symptom_not_allowed": false, "raw_pool_size": 1, "filtered_pool_size": 1, "topk_size": 1, "evidence_count": 1, "review_reason_tag": "mono_fragile", "top1_score": 1.0, "top2_score": 0, "delta_top1_top2": 1.0, "top3_codes": "R06.0", "top3_terms": "Dyspnée"}

View File

@@ -0,0 +1,15 @@
# NUKE-3 — Top erreurs gold CRH
**Date** : 2026-02-24 14:34
**Cas** : 5
| # | Case ID | Choisi | Attendu | Strict | Accept. | Verdict | Conf. | Delta | Reason |
|---|---------|--------|---------|--------|---------|---------|-------|-------|--------|
| 1 | 132_23080179 | R59.0 | C83.3 | FAIL | FAIL | REVIEW | medium | 0 | other |
| 2 | 74_23141536 | D50 | I25.1 | FAIL | FAIL | REVIEW | medium | 0.0 | low_delta |
| 3 | 115_23066188 | A87.0 | A87.0 | OK | OK | CONFIRMED | high | 0 | other |
| 4 | 106_23056475 | I26.9 | I26.9 | OK | OK | REVIEW | medium | 1.0 | low_delta |
| 5 | 73_23139637 | R06.0 | R06.0 | OK | OK | REVIEW | medium | 1.0 | mono_fragile |
---
*Généré le 2026-02-24 14:34*

View File

@@ -0,0 +1,40 @@
{
"case_id": "115_23066188",
"document_type": "trackare",
"gold": {
"dp_expected": {
"code": "A87.0",
"label": "Méningite à entérovirus"
},
"dp_acceptable_codes": [],
"dp_acceptable_family3": [
"A87"
],
"allow_symptom_dp": false,
"confidence": "probable"
},
"prediction": {
"chosen_code": "A87.0",
"chosen_term": "Méningite à entérovirus",
"verdict": "CONFIRMED",
"confidence": "high",
"reason": "DP Trackare — source d'autorité",
"review_reason_tag": "other",
"evidence": [
"Source: Trackare (codage établissement)"
],
"evidence_count": 1
},
"pool_stats": {
"raw_pool_size": 6,
"filtered_pool_size": 0,
"topk_size": 0
},
"top_candidates": [],
"match_eval": {
"strict_match": true,
"acceptable_match": true,
"family3_match": true,
"symptom_not_allowed": false
}
}

View File

@@ -0,0 +1,39 @@
# Case Debug — 115_23066188
**Type** : trackare
**Verdict** : CONFIRMED
**Confidence** : high
**Code choisi** : A87.0
**Reason** : DP Trackare — source d'autorité
**Evidence** : 1 extrait(s)
**Pool** : 6 raw → 0 candidats
**DP attendu** : A87.0 (Méningite à entérovirus)
**Confiance gold** : probable
**Match** : strict=OK, acceptable=OK, symptôme interdit=-
## Gold vs Prediction
| | Gold | NUKE-3 |
|---|------|--------|
| Code | A87.0 | A87.0 |
| Label | Méningite à entérovirus | Méningite à entérovirus |
| Codes acceptables | - | - |
| Family3 | A87 | - |
| Confiance | probable | high |
| Symptôme autorisé | non | - |
## Top candidats
| Rank | Code | Score | Term | Flags | Section |
|------|------|-------|------|-------|---------|
## Evidence
1. Source: Trackare (codage établissement)
## Hypothèse bug
**Pool vide** — aucun candidat DP n'a été extrait. Vérifier l'extraction CIM-10 sur ce document.
---
*Généré le 2026-02-24 14:00*

View File

@@ -0,0 +1,41 @@
{
"case_id": "132_23080179",
"document_type": "trackare",
"gold": {
"dp_expected": {
"code": "C83.3",
"label": "Lymphome diffus à grandes cellules B"
},
"dp_acceptable_codes": [],
"dp_acceptable_family3": [
"C83"
],
"allow_symptom_dp": false,
"confidence": "probable"
},
"prediction": {
"chosen_code": "R59.0",
"chosen_term": "Adénopathie",
"verdict": "REVIEW",
"confidence": "medium",
"reason": "Trackare symptôme vs CRH diagnostic — vérification DIM requise",
"review_reason_tag": "other",
"evidence": [
"Source: Trackare (codage établissement)",
"Alerte: Trackare code un symptôme (R*) mais le CRH mentionne un diagnostic étiologique"
],
"evidence_count": 2
},
"pool_stats": {
"raw_pool_size": 23,
"filtered_pool_size": 0,
"topk_size": 0
},
"top_candidates": [],
"match_eval": {
"strict_match": false,
"acceptable_match": false,
"family3_match": false,
"symptom_not_allowed": true
}
}

View File

@@ -0,0 +1,40 @@
# Case Debug — 132_23080179
**Type** : trackare
**Verdict** : REVIEW
**Confidence** : medium
**Code choisi** : R59.0
**Reason** : Trackare symptôme vs CRH diagnostic — vérification DIM requise
**Evidence** : 2 extrait(s)
**Pool** : 23 raw → 0 candidats
**DP attendu** : C83.3 (Lymphome diffus à grandes cellules B)
**Confiance gold** : probable
**Match** : strict=FAIL, acceptable=FAIL, symptôme interdit=OUI
## Gold vs Prediction
| | Gold | NUKE-3 |
|---|------|--------|
| Code | C83.3 | R59.0 |
| Label | Lymphome diffus à grandes cellules B | Adénopathie |
| Codes acceptables | - | - |
| Family3 | C83 | - |
| Confiance | probable | medium |
| Symptôme autorisé | non | - |
## Top candidats
| Rank | Code | Score | Term | Flags | Section |
|------|------|-------|------|-------|---------|
## Evidence
1. Source: Trackare (codage établissement)
2. Alerte: Trackare code un symptôme (R*) mais le CRH mentionne un diagnostic étiologique
## Hypothèse bug
**Pool vide** — aucun candidat DP n'a été extrait. Vérifier l'extraction CIM-10 sur ce document.
---
*Généré le 2026-02-24 14:33*

View File

@@ -0,0 +1,858 @@
#!/usr/bin/env python3
"""Benchmark NUKE-3 — rapport comparatif LLM off vs on.
Analyse les dossiers JSON existants (output/structured/) pour produire
des métriques DIM-like sur la sélection DP (NUKE-3).
Mode 1 (par défaut) : analyse les JSON existants (pas d'Ollama requis).
Mode 2 (--rerun) : relance le pipeline 2× (LLM off puis LLM on) —
nécessite Ollama pour le mode "on".
Usage:
python scripts/benchmark_nuke3_compare.py # analyse offline
python scripts/benchmark_nuke3_compare.py --n 10 # top 10 dossiers
python scripts/benchmark_nuke3_compare.py --rerun --n 5 # relance pipeline
python scripts/benchmark_nuke3_compare.py --dossiers A,B,C # dossiers spécifiques
python scripts/benchmark_nuke3_compare.py --gold data/gold_crh/gold_crh.jsonl
python scripts/benchmark_nuke3_compare.py --offline --case-id 74_23141536
python scripts/benchmark_nuke3_compare.py --offline --top-errors 20
python scripts/benchmark_nuke3_compare.py --offline --dim-pack 20
"""
from __future__ import annotations
import argparse
import csv
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
OUTPUT_DIR = ROOT / "output" / "structured"
INPUT_DIR = ROOT / "input"
REPORT_PATH = ROOT / "docs" / "NUKE3_BENCHMARK_REPORT.md"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Chargement JSON
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def load_dossier_json(dossier_id: str) -> dict | None:
"""Charge le JSON d'un dossier."""
path = find_merged_json(dossier_id)
if not path:
return None
try:
return json.loads(path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return None
def select_dossiers(n: int, specific: list[str] | None) -> list[str]:
"""Sélectionne les dossiers à analyser."""
if specific:
return [d.strip() for d in specific if d.strip()]
all_dirs = sorted(
d.name for d in OUTPUT_DIR.iterdir()
if d.is_dir() and find_merged_json(d.name) is not None
)
return all_dirs[:n] if n > 0 else all_dirs
# ---------------------------------------------------------------------------
# Analyse NUKE-3 d'un dossier
# ---------------------------------------------------------------------------
def analyze_dp_selection(data: dict) -> dict:
"""Extrait les métriques NUKE-3 d'un dossier."""
dp_sel = data.get("dp_selection")
result = {
"has_dp_selection": dp_sel is not None,
"verdict": None,
"confidence": None,
"chosen_code": None,
"n_candidates": 0,
"n_evidence": 0,
"is_comorbidity_dp": False,
"is_symptom_dp": False,
"is_act_only_dp": False,
"has_evidence": False,
"delta": None,
"reason": None,
}
if not dp_sel:
return result
result["verdict"] = dp_sel.get("verdict")
result["confidence"] = dp_sel.get("confidence")
result["chosen_code"] = dp_sel.get("chosen_code")
candidates = dp_sel.get("candidates", [])
result["n_candidates"] = len(candidates)
evidence = dp_sel.get("evidence", [])
result["n_evidence"] = len(evidence)
result["has_evidence"] = len(evidence) > 0
result["reason"] = dp_sel.get("reason")
# Debug scores
debug = dp_sel.get("debug_scores") or {}
result["delta"] = debug.get("delta")
# Flags du gagnant
if candidates:
winner = candidates[0]
result["is_comorbidity_dp"] = winner.get("is_comorbidity_like", False)
result["is_symptom_dp"] = winner.get("is_symptom_like", False)
result["is_act_only_dp"] = winner.get("is_act_only", False)
return result
# ---------------------------------------------------------------------------
# Agrégation
# ---------------------------------------------------------------------------
def compute_metrics(analyses: list[dict]) -> dict:
"""Calcule les métriques agrégées DIM-like."""
n = len(analyses)
if n == 0:
return {"n": 0}
with_selection = [a for a in analyses if a["has_dp_selection"]]
n_sel = len(with_selection)
confirmed = [a for a in with_selection if a["verdict"] == "CONFIRMED"]
review = [a for a in with_selection if a["verdict"] == "REVIEW"]
# Métriques principales
confirmed_rate = len(confirmed) / n_sel if n_sel else 0
# Evidence
confirmed_with_evidence = sum(1 for a in confirmed if a["has_evidence"])
confirmed_evidence_rate = confirmed_with_evidence / len(confirmed) if confirmed else 0
# Codes problématiques en DP
symptom_count = sum(1 for a in with_selection if a["is_symptom_dp"])
comorbidity_count = sum(1 for a in with_selection if a["is_comorbidity_dp"])
act_only_count = sum(1 for a in with_selection if a["is_act_only_dp"])
# Confidence
conf_high = sum(1 for a in with_selection if a["confidence"] == "high")
conf_med = sum(1 for a in with_selection if a["confidence"] == "medium")
conf_low = sum(1 for a in with_selection if a["confidence"] == "low")
# R-codes en DP (symptômes)
r_code_count = sum(
1 for a in with_selection
if a["chosen_code"] and a["chosen_code"].startswith("R")
)
return {
"n_total": n,
"n_with_selection": n_sel,
"confirmed_count": len(confirmed),
"review_count": len(review),
"confirmed_rate": round(confirmed_rate, 3),
"review_rate": round(1 - confirmed_rate, 3) if n_sel else 0,
"confirmed_evidence_rate": round(confirmed_evidence_rate, 3),
"dp_symptom_rate": round(symptom_count / n_sel, 3) if n_sel else 0,
"dp_comorbidity_rate": round(comorbidity_count / n_sel, 3) if n_sel else 0,
"dp_act_only_rate": round(act_only_count / n_sel, 3) if n_sel else 0,
"dp_r_code_rate": round(r_code_count / n_sel, 3) if n_sel else 0,
"confidence": {
"high": conf_high,
"medium": conf_med,
"low": conf_low,
},
"confidence_high_rate": round(conf_high / n_sel, 3) if n_sel else 0,
}
# ---------------------------------------------------------------------------
# Évaluation gold CRH
# ---------------------------------------------------------------------------
def load_gold(gold_path: str | Path) -> dict:
"""Charge le gold JSONL et retourne un index case_id → GoldCRHCase."""
from src.eval.gold_models import load_gold_index
return load_gold_index(Path(gold_path))
def evaluate_gold_cases(
dossier_details: list[dict],
gold_index: dict,
) -> list[dict]:
"""Évalue les dossiers présents dans le gold. Retourne une liste d'évaluations."""
from src.eval.gold_models import evaluate_dp
evals: list[dict] = []
for d in dossier_details:
case_id = d["id"]
if case_id not in gold_index:
continue
gold_case = gold_index[case_id]
sel = d.get("dp_selection") or {}
chosen_code = sel.get("chosen_code")
verdict = sel.get("verdict")
confidence = sel.get("confidence")
ev = evaluate_dp(chosen_code, gold_case)
ev["verdict"] = verdict
ev["confidence_nuke3"] = confidence
evals.append(ev)
return evals
def compute_gold_metrics(evals: list[dict]) -> dict:
"""Calcule les métriques agrégées sur les cas gold."""
n = len(evals)
if n == 0:
return {"n": 0}
strict = sum(1 for e in evals if e["exact_match_strict"])
tolerant = sum(1 for e in evals if e["exact_match_tolerant_codes"])
family3 = sum(1 for e in evals if e["family3_match_tolerant"])
acceptable = sum(1 for e in evals if e["acceptable_match"])
symptom_bad = sum(1 for e in evals if e["symptom_not_allowed"])
# Confirmed-only accuracy
confirmed_evals = [e for e in evals if e["verdict"] == "CONFIRMED"]
n_conf = len(confirmed_evals)
conf_acceptable = sum(1 for e in confirmed_evals if e["acceptable_match"])
return {
"n": n,
"exact_match_strict": strict,
"exact_match_strict_rate": round(strict / n, 3),
"exact_match_tolerant": tolerant,
"exact_match_tolerant_rate": round(tolerant / n, 3),
"family3_match": family3,
"family3_match_rate": round(family3 / n, 3),
"acceptable_match": acceptable,
"acceptable_match_rate": round(acceptable / n, 3),
"confirmed_accuracy_tolerant": round(conf_acceptable / n_conf, 3) if n_conf else None,
"confirmed_count": n_conf,
"symptom_not_allowed": symptom_bad,
"symptom_not_allowed_rate": round(symptom_bad / n, 3),
}
def write_gold_eval_csv(evals: list[dict], csv_path: Path) -> None:
"""Écrit le CSV d'évaluation gold."""
cols = [
"case_id", "chosen_code", "verdict", "confidence_nuke3",
"dp_expected_code", "acceptable_match", "exact_match_strict",
"symptom_not_allowed", "allow_symptom_dp", "confidence_gold",
]
csv_path.parent.mkdir(parents=True, exist_ok=True)
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=cols, extrasaction="ignore")
writer.writeheader()
for ev in evals:
row = {
"case_id": ev["case_id"],
"chosen_code": ev["chosen_code"] or "",
"verdict": ev["verdict"] or "",
"confidence_nuke3": ev["confidence_nuke3"] or "",
"dp_expected_code": ev["dp_expected_code"],
"acceptable_match": ev["acceptable_match"],
"exact_match_strict": ev["exact_match_strict"],
"symptom_not_allowed": ev["symptom_not_allowed"],
"allow_symptom_dp": ev["allow_symptom_dp"],
"confidence_gold": ev["confidence_gold"],
}
writer.writerow(row)
# ---------------------------------------------------------------------------
# Re-run pipeline (mode --rerun)
# ---------------------------------------------------------------------------
def check_ollama() -> bool:
"""Vérifie que Ollama est joignable."""
try:
import urllib.request
url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
req = urllib.request.Request(f"{url}/api/tags", method="GET")
urllib.request.urlopen(req, timeout=5)
return True
except Exception:
return False
def run_pipeline_with_env(dossier_id: str, llm_flag: str) -> bool:
"""Lance le pipeline sur un dossier avec T2A_DP_RANKER_LLM=flag."""
env = os.environ.copy()
env["T2A_DP_RANKER_LLM"] = llm_flag
try:
result = subprocess.run(
[PY, "-m", "src.main", str(INPUT_DIR / dossier_id)],
capture_output=True, text=True, cwd=str(ROOT),
timeout=600, env=env,
)
return result.returncode == 0
except Exception as e:
print(f" ERREUR: {e}")
return False
# ---------------------------------------------------------------------------
# Rapport Markdown
# ---------------------------------------------------------------------------
def _pct(v: float) -> str:
return f"{v * 100:.1f}%"
def generate_report(
metrics_off: dict,
metrics_on: dict | None,
dossier_details: list[dict],
args: argparse.Namespace,
gold_metrics: dict | None = None,
gold_evals: list[dict] | None = None,
) -> str:
"""Génère le rapport Markdown."""
lines: list[str] = []
now = datetime.now().strftime("%Y-%m-%d %H:%M")
# Commit hash
try:
commit = subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"],
cwd=str(ROOT), text=True, stderr=subprocess.DEVNULL,
).strip()
except Exception:
commit = "?"
lines.append("# NUKE-3 — Benchmark Report")
lines.append("")
lines.append(f"**Date** : {now} ")
lines.append(f"**Commit** : `{commit}` ")
lines.append(f"**Dossiers analysés** : {metrics_off['n_total']} ")
lines.append(f"**Mode** : {'rerun pipeline' if args.rerun else 'analyse offline (JSON existants)'} ")
lines.append("")
# Table comparative
lines.append("## Métriques DIM-like")
lines.append("")
if metrics_on:
lines.append("| Métrique | LLM OFF | LLM ON | Delta |")
lines.append("|----------|---------|--------|-------|")
rows = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows:
v_off = metrics_off.get(key, 0)
v_on = metrics_on.get(key, 0)
delta = v_on - v_off
sign = "+" if delta > 0 else ""
lines.append(
f"| {label} | {_pct(v_off)} | {_pct(v_on)} | {sign}{_pct(delta)} |"
)
else:
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
rows_single = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows_single:
v = metrics_off.get(key, 0)
lines.append(f"| {label} | {_pct(v)} |")
lines.append("")
# Volumes
lines.append("## Volumes")
lines.append("")
lines.append(f"- Dossiers avec dp_selection : {metrics_off['n_with_selection']}/{metrics_off['n_total']}")
lines.append(f"- CONFIRMED : {metrics_off['confirmed_count']}")
lines.append(f"- REVIEW : {metrics_off['review_count']}")
c = metrics_off.get("confidence", {})
lines.append(f"- Confidence — high: {c.get('high', 0)}, medium: {c.get('medium', 0)}, low: {c.get('low', 0)}")
lines.append("")
# Détail par dossier
lines.append("## Détail par dossier")
lines.append("")
lines.append("| Dossier | Verdict | Code | Confidence | Evidence | Candidats | Reason |")
lines.append("|---------|---------|------|------------|----------|-----------|--------|")
for d in dossier_details:
sel = d.get("dp_selection", {})
if not sel:
lines.append(f"| {d['id']} | - | - | - | - | - | pas de dp_selection |")
continue
lines.append(
f"| {d['id']} "
f"| {sel.get('verdict', '-')} "
f"| {sel.get('chosen_code', '-')} "
f"| {sel.get('confidence', '-')} "
f"| {sel.get('n_evidence', 0)} "
f"| {sel.get('n_candidates', 0)} "
f"| {(sel.get('reason') or '-')[:60]} |"
)
# Section gold CRH
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
lines.append("")
lines.append("## Évaluation Gold CRH")
lines.append("")
lines.append(f"**Cas gold évalués** : {gm['n']} ")
lines.append("")
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
lines.append(f"| Exact match (strict) | {_pct(gm['exact_match_strict_rate'])} ({gm['exact_match_strict']}/{gm['n']}) |")
lines.append(f"| Exact match (codes tolérants) | {_pct(gm['exact_match_tolerant_rate'])} ({gm['exact_match_tolerant']}/{gm['n']}) |")
lines.append(f"| Family3 match (tolérant) | {_pct(gm['family3_match_rate'])} ({gm['family3_match']}/{gm['n']}) |")
lines.append(f"| Acceptable match (codes OU family3) | {_pct(gm['acceptable_match_rate'])} ({gm['acceptable_match']}/{gm['n']}) |")
if gm["confirmed_accuracy_tolerant"] is not None:
lines.append(f"| Confirmed accuracy (tolérant) | {_pct(gm['confirmed_accuracy_tolerant'])} ({gm['confirmed_count']} CONFIRMED) |")
lines.append(f"| Symptôme non autorisé | {gm['symptom_not_allowed']}/{gm['n']} |")
lines.append("")
# Détail par cas gold
if gold_evals:
lines.append("### Détail par cas gold")
lines.append("")
lines.append("| Case ID | Choisi | Attendu | Strict | Acceptable | Symptôme interdit | Verdict |")
lines.append("|---------|--------|---------|--------|------------|-------------------|---------|")
for ev in gold_evals:
ok_s = "OK" if ev["exact_match_strict"] else "FAIL"
ok_a = "OK" if ev["acceptable_match"] else "FAIL"
sym = "OUI" if ev["symptom_not_allowed"] else "-"
lines.append(
f"| {ev['case_id']} "
f"| {ev['chosen_code'] or '-'} "
f"| {ev['dp_expected_code']} "
f"| {ok_s} "
f"| {ok_a} "
f"| {sym} "
f"| {ev['verdict'] or '-'} |"
)
lines.append("")
lines.append("")
lines.append("---")
lines.append(f"*Généré par `scripts/benchmark_nuke3_compare.py` — {now}*")
# Règle DIM rappel
lines.append("")
lines.append("> **Règle DIM** : `CONFIRMED` ⇒ `evidence` obligatoirement non vide.")
lines.append("> Un DP sans preuve exploitable est automatiquement `REVIEW`.")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _rebuild_and_select(data: dict) -> dict:
"""Reconstruit un DossierMedical depuis le JSON et exécute select_dp() offline.
Utile quand les JSON n'ont pas de champ dp_selection (générés avant NUKE-3).
"""
from src.config import DossierMedical, Diagnostic, Sejour
from src.medical.dp_selector import select_dp
dp_raw = data.get("diagnostic_principal", {})
das_raw = data.get("diagnostics_associes", [])
doc_type = data.get("document_type", "crh")
sej_raw = data.get("sejour", {})
dp_diag = None
if dp_raw and dp_raw.get("texte"):
dp_diag = Diagnostic(
texte=dp_raw.get("texte", ""),
cim10_suggestion=dp_raw.get("cim10_suggestion") or dp_raw.get("cim10_final"),
cim10_confidence=dp_raw.get("cim10_confidence"),
source=dp_raw.get("source"),
)
das_list = []
for d_item in das_raw:
code = d_item.get("cim10_suggestion") or d_item.get("cim10_final")
if not code:
continue
das_list.append(Diagnostic(
texte=d_item.get("texte", ""),
cim10_suggestion=code,
cim10_confidence=d_item.get("cim10_confidence"),
source=d_item.get("source"),
status=d_item.get("status"),
))
safe_sej = {k: v for k, v in sej_raw.items() if k in Sejour.model_fields}
dossier = DossierMedical(
document_type=doc_type,
sejour=Sejour(**safe_sej),
diagnostic_principal=dp_diag,
diagnostics_associes=das_list,
)
# Construire synthese depuis les champs disponibles.
# Les JSONs pré-NUKE-3 n'ont pas de sections CRH stockées.
# On récupère le texte de conclusion depuis les source_excerpt si besoin.
conclusion = data.get("conclusion_medicale", "")
if not conclusion:
# Chercher "CONCLUSION" dans source_excerpt des DAS ou traitements.
# Prendre l'extrait le plus long (les courts sont souvent tronqués).
best = ""
for container in (das_raw, data.get("traitements_sortie", [])):
for item in container:
excerpt = item.get("source_excerpt", "")
up = excerpt.upper()
if "CONCLUSION" in up:
idx = up.index("CONCLUSION")
candidate = excerpt[idx:]
if len(candidate) > len(best):
best = candidate
conclusion = best
synthese = {
"motif": data.get("motif_hospitalisation", ""),
"conclusion": conclusion,
"diag_sortie": data.get("synthese_medicale", {}).get("diag_sortie", ""),
"diag_principal": data.get("synthese_medicale", {}).get("diag_principal", ""),
"synthese": data.get("synthese_medicale", {}).get("synthese", ""),
}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
# Convertir en dict compatible analyze_dp_selection
cands = [c.model_dump() for c in selection.candidates]
return {
"dp_selection": {
"verdict": selection.verdict,
"confidence": selection.confidence,
"chosen_code": selection.chosen_code,
"chosen_term": selection.chosen_term,
"candidates": cands,
"evidence": selection.evidence,
"reason": selection.reason,
"debug_scores": selection.debug_scores,
}
}
def _run_debug_reports(
args: argparse.Namespace,
dossier_ids: list[str],
dossier_details: list[dict],
gold_index: dict | None,
gold_evals: list[dict] | None,
out_dir: Path,
) -> None:
"""Exécute les modes --case-id, --top-errors, --dim-pack."""
from src.eval.gold_debug import (
build_case_report,
write_case_report,
build_error_entry,
sort_error_entries,
write_top_errors_csv,
write_top_errors_md,
write_top_errors_jsonl,
select_dim_pack_cases,
write_dim_pack,
)
from src.eval.gold_models import evaluate_dp
has_debug = args.case_id or args.top_errors > 0 or args.dim_pack > 0
if not has_debug:
return
# Helper : build full report for a case
def _build_report_for(case_id: str) -> dict | None:
data = load_dossier_json(case_id)
if not data:
return None
# Offline rebuild si nécessaire
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
dp_sel = data.get("dp_selection")
gold_case_dict = None
eval_result = None
if gold_index and case_id in gold_index:
gc = gold_index[case_id]
gold_case_dict = gc.model_dump()
chosen_code = (dp_sel or {}).get("chosen_code")
eval_result = evaluate_dp(chosen_code, gc)
return build_case_report(case_id, data, dp_sel, gold_case_dict, eval_result)
# --case-id
if args.case_id:
cid = args.case_id.strip()
data = load_dossier_json(cid)
if not data:
print(f"ERREUR: output JSON introuvable pour {cid}")
print(f" Suggestion : relancer le pipeline avec --rerun ou vérifier output/structured/{cid}/")
sys.exit(1)
if gold_index and cid not in gold_index:
print(f"ERREUR: {cid} absent du gold ({len(gold_index)} cas chargés)")
sys.exit(1)
report = _build_report_for(cid)
if report:
jp, mp = write_case_report(report, out_dir)
print(f"\n=== Case debug: {cid} ===")
print(f" JSON : {jp}")
print(f" MD : {mp}")
# --top-errors
if args.top_errors > 0:
if not gold_index:
print("ERREUR: --top-errors requiert --gold (ou auto-détection gold_crh.jsonl)")
sys.exit(1)
# Build reports for all gold cases
all_reports: list[dict] = []
gold_case_ids = set(gold_index.keys())
for cid in dossier_ids:
if cid not in gold_case_ids:
continue
r = _build_report_for(cid)
if r:
all_reports.append(r)
entries = [build_error_entry(r) for r in all_reports]
entries = sort_error_entries(entries)
entries = entries[:args.top_errors]
csv_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.csv"
md_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.md"
jsonl_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.jsonl"
write_top_errors_csv(entries, csv_p)
write_top_errors_md(entries, md_p)
write_top_errors_jsonl(entries, jsonl_p)
print(f"\n=== Top {len(entries)} erreurs gold ===")
print(f" CSV : {csv_p}")
print(f" MD : {md_p}")
print(f" JSONL : {jsonl_p}")
# --dim-pack
if args.dim_pack > 0:
# Build reports for all CRH (non-trackare) dossiers
all_reports_dim: list[dict] = []
for cid in dossier_ids:
r = _build_report_for(cid)
if r and r["document_type"] != "trackare":
all_reports_dim.append(r)
elif r and r["prediction"]["verdict"] == "REVIEW":
# Include trackare-sans-DP too (they go through scoring)
all_reports_dim.append(r)
selected = select_dim_pack_cases(all_reports_dim, args.dim_pack)
csv_p, cases_dir = write_dim_pack(selected, out_dir)
print(f"\n=== DIM Pack ({len(selected)} cas) ===")
print(f" CSV : {csv_p}")
print(f" Cas JSON : {cases_dir}/")
def main():
parser = argparse.ArgumentParser(description="Benchmark NUKE-3 comparatif")
parser.add_argument("--n", type=int, default=0, help="Nombre de dossiers (0=tous)")
parser.add_argument("--dossiers", type=str, default="", help="IDs séparés par virgules")
parser.add_argument("--rerun", action="store_true", help="Relancer le pipeline (nécessite Ollama pour LLM on)")
parser.add_argument("--offline", action="store_true",
help="Exécuter NUKE-3 offline (reconstruit DossierMedical depuis JSON, LLM off)")
parser.add_argument("--gold", type=str, default="",
help="Fichier JSONL gold CRH (évaluation tolérante)")
parser.add_argument("--case-id", type=str, default="",
help="Rapport détaillé pour un cas (ex: 74_23141536)")
parser.add_argument("--top-errors", type=int, default=0,
help="Top N erreurs gold (ex: 20)")
parser.add_argument("--dim-pack", type=int, default=0,
help="Pack DIM de N cas CRH à annoter (ex: 20)")
parser.add_argument("--out-dir", type=str, default=str(ROOT / "docs" / "gold_debug"),
help="Dossier de sortie pour debug reports")
parser.add_argument("--output", type=str, default=str(REPORT_PATH), help="Chemin du rapport")
args = parser.parse_args()
specific = [d.strip() for d in args.dossiers.split(",") if d.strip()] if args.dossiers else None
dossier_ids = select_dossiers(args.n, specific)
if not dossier_ids:
print("ERREUR: aucun dossier trouvé")
sys.exit(1)
print(f"NUKE-3 benchmark — {len(dossier_ids)} dossiers")
# Mode rerun
if args.rerun:
ollama_ok = check_ollama()
print(f" Ollama: {'OK' if ollama_ok else 'INDISPONIBLE'}")
# Pass 1 : LLM OFF
print("\n=== Pass 1 : T2A_DP_RANKER_LLM=0 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "0")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
# Analyse JSON existants (ou résultat du pass 1)
print("\n=== Analyse des dossiers ===")
analyses_off: list[dict] = []
dossier_details: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if not data:
print(f" {did}: JSON introuvable")
dossier_details.append({"id": did, "dp_selection": None})
continue
# Mode offline : reconstruire le DossierMedical et exécuter select_dp
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
analysis = analyze_dp_selection(data)
analyses_off.append(analysis)
dossier_details.append({"id": did, "dp_selection": analysis})
verdict = analysis["verdict"] or "-"
code = analysis["chosen_code"] or "-"
print(f" {did}: {verdict}{code} (evidence: {analysis['n_evidence']})")
metrics_off = compute_metrics(analyses_off)
# Pass 2 : LLM ON (si rerun + Ollama dispo)
metrics_on = None
if args.rerun:
if not check_ollama():
print("\nWARN: Ollama indisponible — pass LLM ON ignorée")
print(" Le rapport ne contiendra que les métriques LLM OFF")
else:
print("\n=== Pass 2 : T2A_DP_RANKER_LLM=1 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "1")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
analyses_on: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if data:
analyses_on.append(analyze_dp_selection(data))
metrics_on = compute_metrics(analyses_on)
# Gold CRH
gold_metrics = None
gold_evals = None
gold_index = None
gold_path = args.gold
if not gold_path:
# Auto-détection
default_gold = ROOT / "data" / "gold_crh" / "gold_crh.jsonl"
if default_gold.exists():
gold_path = str(default_gold)
if gold_path:
try:
gold_index = load_gold(gold_path)
print(f"\n=== Évaluation Gold CRH ({len(gold_index)} cas) ===")
gold_evals = evaluate_gold_cases(dossier_details, gold_index)
gold_metrics = compute_gold_metrics(gold_evals)
for ev in gold_evals:
match_str = "OK" if ev["acceptable_match"] else "FAIL"
sym_str = " [R* interdit]" if ev["symptom_not_allowed"] else ""
print(f" {ev['case_id']}: {ev['chosen_code'] or '-'} vs {ev['dp_expected_code']}"
f"{match_str}{sym_str}")
# CSV évaluation
csv_out = ROOT / "docs" / "NUKE3_GOLD_EVAL.csv"
write_gold_eval_csv(gold_evals, csv_out)
print(f"\nCSV évaluation : {csv_out}")
except Exception as e:
print(f"\nERREUR gold : {e}")
gold_metrics = None
gold_evals = None
# --- Debug reports (--case-id, --top-errors, --dim-pack) ---
out_dir = Path(args.out_dir)
_run_debug_reports(args, dossier_ids, dossier_details, gold_index, gold_evals, out_dir)
# Rapport
report = generate_report(
metrics_off, metrics_on, dossier_details, args,
gold_metrics=gold_metrics, gold_evals=gold_evals,
)
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(report, encoding="utf-8")
print(f"\nRapport écrit : {output_path}")
# Résumé console
print(f"\n{'='*50}")
print(f"CONFIRMED : {metrics_off['confirmed_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['confirmed_rate'])})")
print(f"REVIEW : {metrics_off['review_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['review_rate'])})")
print(f"Evidence : {_pct(metrics_off['confirmed_evidence_rate'])} des CONFIRMED")
print(f"DP symptôme : {_pct(metrics_off['dp_symptom_rate'])}")
print(f"DP comorbidité: {_pct(metrics_off['dp_comorbidity_rate'])}")
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
print(f"\n--- Gold CRH ({gm['n']} cas) ---")
print(f"Strict match : {_pct(gm['exact_match_strict_rate'])}")
print(f"Acceptable match : {_pct(gm['acceptable_match_rate'])}")
if gm['confirmed_accuracy_tolerant'] is not None:
print(f"Confirmed acc. : {_pct(gm['confirmed_accuracy_tolerant'])}")
print(f"Symptôme interdit: {gm['symptom_not_allowed']}")
print(f"{'='*50}")
if __name__ == "__main__":
main()

View File

@@ -715,6 +715,195 @@ class TestDiagSectionScoring:
assert "Embolie pulmonaire" in diag_ev[0]
# ---------------------------------------------------------------------------
# Tests : alias diagnostiques + conclusion bonus (DLBCL / Trackare garde-fou)
# ---------------------------------------------------------------------------
class TestAliasAndConclusionBonus:
"""Valide le matching par alias clinique (DLBCL→C83.3) et le bonus conclusion."""
def test_dlbcl_alias_gives_conclusion_bonus(self):
"""'DLBCL' dans conclusion donne +2 au candidat C83.3 via alias."""
candidates = [
DPCandidate(index=0, term="Adénopathie", code="R59.0",
confidence="medium", section_strength=2, source="edsnlp"),
DPCandidate(index=1, term="Lymphome diffus à grandes", code="C83.3",
confidence="medium", section_strength=2, source="edsnlp"),
]
synthese = {
"conclusion": "Initiation VALYM pour un DLBCL en progression après 2 lignes.",
}
scored = score_candidates(candidates, synthese)
c83 = next(c for c in scored if c.code == "C83.3")
assert c83.score_details.get("diag_section_bonus") == 2
# R59.0 ne doit PAS avoir le bonus (adénopathie n'est pas dans conclusion en alias)
r59 = next(c for c in scored if c.code == "R59.0")
assert "diag_section_bonus" not in r59.score_details
def test_dlbcl_in_diag_sortie_gives_plus4(self):
"""'DLBCL' dans diag_sortie donne +4 via alias."""
candidates = [
DPCandidate(index=0, term="Lymphome diffus", code="C83.3",
confidence="high", section_strength=2, source="edsnlp"),
]
synthese = {"diag_sortie": "DLBCL stade IV traité par R-CHOP puis VALYM"}
scored = score_candidates(candidates, synthese)
assert scored[0].score_details.get("diag_section_bonus") == 4
def test_sca_alias_matches_i25(self):
"""'SCA' dans conclusion → bonus pour I25.1 via alias."""
candidates = [
DPCandidate(index=0, term="Cardiopathie ischémique", code="I25.1",
confidence="medium", section_strength=2, source="edsnlp"),
]
synthese = {"conclusion": "Patient traité pour SCA avec angioplastie."}
scored = score_candidates(candidates, synthese)
assert scored[0].score_details.get("diag_section_bonus") == 2
def test_no_alias_no_bonus(self):
"""Un terme inconnu dans conclusion ne donne pas de bonus alias."""
candidates = [
DPCandidate(index=0, term="Ostéolyse", code="M89.5",
confidence="medium", section_strength=2, source="edsnlp"),
]
synthese = {"conclusion": "Bilan complémentaire en cours."}
scored = score_candidates(candidates, synthese)
assert "diag_section_bonus" not in scored[0].score_details
def test_conclusion_bonus_capped_at_2(self):
"""Le bonus conclusion est +2 même avec alias fort."""
candidates = [
DPCandidate(index=0, term="Lymphome diffus", code="C83.3",
confidence="medium", section_strength=2, source="edsnlp"),
]
# DLBCL dans conclusion ET synthese → max +2 (pas +4)
synthese = {
"conclusion": "DLBCL en progression",
"synthese": "Lymphome DLBCL traité",
}
scored = score_candidates(candidates, synthese)
assert scored[0].score_details.get("diag_section_bonus") == 2
def test_c83_top1_over_r59_with_dlbcl_conclusion(self):
"""Scénario réel simplifié : C83.3 bat R59.0 grâce à alias DLBCL."""
candidates = [
DPCandidate(index=0, term="Adénopathie", code="R59.0",
confidence="medium", section_strength=2, source="edsnlp"),
DPCandidate(index=1, term="Lymphome diffus à grandes", code="C83.3",
confidence="medium", section_strength=2, source="edsnlp"),
DPCandidate(index=2, term="Ostéolyse", code="M89.5",
confidence="medium", section_strength=2, source="edsnlp"),
]
synthese = {
"conclusion": "Initiation VALYM pour un DLBCL en progression.",
}
scored = score_candidates(candidates, synthese)
# C83.3 doit être top1 grâce à alias DLBCL (+2) et R59.0 pénalisé (-2 symptom)
assert scored[0].code == "C83.3"
# R59.0 pénalisé par symptom_malus
r59 = next(c for c in scored if c.code == "R59.0")
assert r59.score < scored[0].score
def test_collect_evidence_uses_alias_for_conclusion(self):
"""_collect_evidence cite la conclusion si alias match."""
from src.medical.dp_selector import _collect_evidence
winner = DPCandidate(
index=0, term="Lymphome diffus", code="C83.3",
confidence="medium", section_strength=2, source="edsnlp", score=4.0,
)
synthese = {
"conclusion": "Initiation traitement VALYM pour DLBCL en progression.",
}
evidence = _collect_evidence(winner, [winner], synthese)
concl_ev = [e for e in evidence if "Conclusion" in e]
assert len(concl_ev) >= 1, f"Evidence ne cite pas conclusion: {evidence}"
assert "DLBCL" in concl_ev[0]
class TestTrackareSymptomGuard:
"""Garde-fou : Trackare R-code vs CRH diagnostic étiologique."""
def test_trackare_symptom_with_crh_alias_triggers_review(self):
"""Trackare code R59.0 mais conclusion mentionne DLBCL → REVIEW."""
from src.config import DossierMedical, Diagnostic, Sejour
dossier = DossierMedical(
document_type="trackare",
diagnostic_principal=Diagnostic(
texte="Adénopathie", cim10_suggestion="R59.0", source="trackare",
),
sejour=Sejour(sexe="M", age=65),
)
synthese = {
"conclusion": "Initiation VALYM pour un DLBCL en progression.",
}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
assert selection.verdict == "REVIEW"
assert selection.chosen_code == "R59.0" # On ne change pas le code
assert selection.confidence == "medium"
assert any("symptôme" in e.lower() or "diagnostic" in e.lower()
for e in selection.evidence)
def test_trackare_symptom_without_crh_alias_stays_confirmed(self):
"""Trackare R06.0 sans alias CRH fort → reste CONFIRMED."""
from src.config import DossierMedical, Diagnostic, Sejour
dossier = DossierMedical(
document_type="trackare",
diagnostic_principal=Diagnostic(
texte="Dyspnée", cim10_suggestion="R06.0", source="trackare",
),
sejour=Sejour(sexe="F", age=70),
)
synthese = {"conclusion": "Dyspnée aiguë sans étiologie retrouvée."}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
assert selection.verdict == "CONFIRMED"
assert selection.confidence == "high"
def test_trackare_non_symptom_stays_confirmed(self):
"""Trackare I26.9 (pas un R-code) → CONFIRMED sans garde-fou."""
from src.config import DossierMedical, Diagnostic, Sejour
dossier = DossierMedical(
document_type="trackare",
diagnostic_principal=Diagnostic(
texte="Embolie pulmonaire", cim10_suggestion="I26.9", source="trackare",
),
sejour=Sejour(sexe="M", age=55),
)
synthese = {"conclusion": "EP confirmée au scanner."}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
assert selection.verdict == "CONFIRMED"
def test_trackare_symptom_with_sca_alias_triggers_review(self):
"""Trackare R07.4 (douleur thoracique) mais conclusion mentionne SCA → REVIEW."""
from src.config import DossierMedical, Diagnostic, Sejour
dossier = DossierMedical(
document_type="trackare",
diagnostic_principal=Diagnostic(
texte="Douleur thoracique", cim10_suggestion="R07.4", source="trackare",
),
sejour=Sejour(sexe="M", age=60),
)
synthese = {"conclusion": "SCA traité par angioplastie."}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
assert selection.verdict == "REVIEW"
assert selection.chosen_code == "R07.4"
# ---------------------------------------------------------------------------
# Tests : cas 74 — régression ciblée D50 vs I25.1 (C)
# ---------------------------------------------------------------------------
@@ -743,32 +932,39 @@ class TestCase74Regression:
f"Scores: {[(c.code, c.score) for c in selection.candidates]}"
)
def test_case74_verdict_confirmed(self):
"""Avec le bonus +4, le delta doit être suffisant pour CONFIRMED."""
def test_case74_i25_is_top1_with_positive_delta(self):
"""I25.1 est top1 avec un delta positif (D50 a aussi un bonus conclusion).
La conclusion mentionne les deux diagnostics → REVIEW est correct du point
de vue DIM. L'essentiel est que I25.1 soit bien classé devant D50.
"""
fixture = _load_fixture("case_74_min.json")
dossier = _build_dossier(fixture)
synthese = fixture["synthese_nuke1"]
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
assert selection.verdict == "CONFIRMED", (
f"Attendu CONFIRMED, obtenu {selection.verdict}. "
f"Reason: {selection.reason}"
)
# Règle A1 : CONFIRMED ⇒ evidence non vide
assert selection.chosen_code == "I25.1"
assert len(selection.candidates) >= 2
delta = selection.candidates[0].score - selection.candidates[1].score
assert delta > 0, f"I25.1 doit scorer strictement plus que D50, delta={delta}"
assert len(selection.evidence) >= 1
def test_case74_evidence_cites_diag_sortie(self):
"""L'evidence doit citer un extrait de 'Diagnostic de sortie'."""
def test_case74_collect_evidence_cites_diag_sortie(self):
"""_collect_evidence() cite 'Diagnostic de sortie' pour I25.1."""
from src.medical.dp_selector import _collect_evidence
fixture = _load_fixture("case_74_min.json")
dossier = _build_dossier(fixture)
synthese = fixture["synthese_nuke1"]
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
winner = selection.candidates[0]
evidence = _collect_evidence(winner, selection.candidates, synthese)
diag_ev = [e for e in selection.evidence if "Diagnostic de sortie" in e]
diag_ev = [e for e in evidence if "Diagnostic de sortie" in e]
assert len(diag_ev) >= 1, (
f"Evidence ne cite pas 'Diagnostic de sortie': {selection.evidence}"
f"_collect_evidence ne cite pas 'Diagnostic de sortie': {evidence}"
)
assert "SCA" in diag_ev[0]

View File

@@ -0,0 +1,217 @@
"""Smoke tests end-to-end sur PDFs CRH réels.
Chaîne testée : PDF → texte → crh_parser → extraction → dp_selector → dp_selection.
Les tests skip proprement si les PDFs ne sont pas présents localement.
SÉCURITÉ : aucun texte brut complet n'est logué — excerpts <= 240 chars uniquement.
"""
from __future__ import annotations
from pathlib import Path
import pytest
# ---------------------------------------------------------------------------
# Chemins PDF réels (non versionnés, gitignored)
# ---------------------------------------------------------------------------
REAL_CRH_DIR = Path(__file__).resolve().parent.parent / "real_crh_pdfs"
PDF_23066188 = REAL_CRH_DIR / "23066188.pdf"
PDF_23080179 = REAL_CRH_DIR / "23080179.pdf"
needs_pdf_23066188 = pytest.mark.skipif(
not PDF_23066188.exists(),
reason=f"PDF réel absent : {PDF_23066188}",
)
needs_pdf_23080179 = pytest.mark.skipif(
not PDF_23080179.exists(),
reason=f"PDF réel absent : {PDF_23080179}",
)
# ---------------------------------------------------------------------------
# Helper : pipeline minimale offline (pas de LLM)
# ---------------------------------------------------------------------------
def _run_offline_pipeline(pdf_path: Path) -> tuple:
"""Exécute la chaîne CRH complète sans Ollama.
Returns (raw_text, parsed, dossier) — excerpts seulement pour asserts.
"""
from src.extraction.pdf_extractor import extract_text
from src.extraction.document_classifier import classify
from src.extraction.crh_parser import parse_crh
from src.anonymization.anonymizer import Anonymizer
from src.medical.cim10_extractor import extract_medical_info
raw_text = extract_text(pdf_path)
assert len(raw_text) > 200, "PDF trop court ou vide"
doc_type = classify(raw_text)
assert doc_type == "crh", f"Attendu crh, obtenu {doc_type}"
parsed = parse_crh(raw_text)
anonymizer = Anonymizer(parsed_data=parsed)
anon_text = anonymizer.anonymize(raw_text)
# edsnlp optionnel
edsnlp_result = None
try:
from src.medical.edsnlp_pipeline import analyze, is_available
if is_available():
edsnlp_result = analyze(anon_text)
except Exception:
pass
dossier = extract_medical_info(
parsed_data=parsed,
anonymized_text=anon_text,
edsnlp_result=edsnlp_result,
use_rag=False,
raw_text=raw_text,
)
return raw_text, parsed, dossier
# ===================================================================
# Test 1 : Case 23066188 — Méningite à entérovirus (DP clair)
# ===================================================================
@needs_pdf_23066188
class TestCase23066188:
"""CRH pédiatrie — méningite à entérovirus, DP sans ambiguïté."""
@pytest.fixture(autouse=True)
def setup(self):
self.raw_text, self.parsed, self.dossier = _run_offline_pipeline(PDF_23066188)
def test_document_classified_crh(self):
assert self.dossier.document_type == "crh"
def test_sections_contain_meningite(self):
"""Le parser CRH doit trouver au moins une section mentionnant méningite."""
sections = self.parsed.get("sections", {})
contenu = self.parsed.get("contenu_medical", "")
all_text = " ".join(sections.values()) + " " + contenu
low = all_text.lower()
assert "méningite" in low or "meningite" in low, (
f"'méningite' introuvable dans sections/contenu (excerpt: {all_text[:240]})"
)
def test_sections_contain_enterovirus(self):
"""Entérovirus doit apparaître dans le contenu médical."""
sections = self.parsed.get("sections", {})
contenu = self.parsed.get("contenu_medical", "")
all_text = " ".join(sections.values()) + " " + contenu
low = all_text.lower()
assert "entérovirus" in low or "enterovirus" in low, (
f"'entérovirus' introuvable (excerpt: {all_text[:240]})"
)
def test_pool_contains_a87(self):
"""Le pool de candidats doit contenir un code famille A87."""
sel = self.dossier.dp_selection
assert sel is not None, "dp_selection est None"
codes = [c.code or "" for c in sel.candidates]
has_a87 = any(c.startswith("A87") for c in codes)
# Fallback tolérant : vérifier aussi le DP extrait
dp = self.dossier.diagnostic_principal
dp_code = dp.cim10_suggestion if dp else ""
assert has_a87 or (dp_code or "").startswith("A87"), (
f"Aucun candidat A87.* — codes trouvés: {codes}, DP: {dp_code}"
)
def test_dp_code_family_a87(self):
"""Le DP choisi doit être dans la famille A87."""
sel = self.dossier.dp_selection
assert sel is not None
chosen = sel.chosen_code or ""
# Tolérant : accepter family3 match
assert chosen.startswith("A87") or chosen.startswith("A87"), (
f"DP choisi = {chosen}, attendu A87.*"
)
def test_evidence_if_confirmed(self):
"""Si verdict CONFIRMED, evidence ne doit pas être vide (règle A1)."""
sel = self.dossier.dp_selection
if sel and sel.verdict == "CONFIRMED":
assert len(sel.evidence) > 0, "CONFIRMED sans evidence — violation règle A1"
# ===================================================================
# Test 2 : Case 23080179 — DLBCL masqué par adénopathie (piège)
# ===================================================================
@needs_pdf_23080179
class TestCase23080179:
"""CRH onco — lymphome DLBCL masqué par adénopathie R59.0."""
@pytest.fixture(autouse=True)
def setup(self):
self.raw_text, self.parsed, self.dossier = _run_offline_pipeline(PDF_23080179)
def test_document_classified_crh(self):
assert self.dossier.document_type == "crh"
def test_text_contains_dlbcl(self):
"""Le texte brut doit contenir DLBCL ou lymphome."""
low = self.raw_text[:5000].lower()
assert "dlbcl" in low or "lymphome" in low, (
f"'DLBCL'/'lymphome' introuvable dans les 5000 premiers chars"
)
def test_conclusion_contains_valym(self):
"""La conclusion doit mentionner le protocole VALYM."""
conclusion = self.parsed.get("sections", {}).get("conclusion", "")
assert "VALYM" in conclusion or "valym" in conclusion.lower(), (
f"'VALYM' introuvable dans conclusion (excerpt: {conclusion[:240]})"
)
def test_pool_contains_c83(self):
"""Le pool doit contenir un candidat famille C83 (lymphome)."""
sel = self.dossier.dp_selection
assert sel is not None, "dp_selection est None"
codes = [c.code or "" for c in sel.candidates]
dp = self.dossier.diagnostic_principal
dp_code = dp.cim10_suggestion if dp else ""
all_codes = codes + [dp_code]
has_c83 = any(c.startswith("C83") for c in all_codes)
# Tolérant : accepter aussi C85 (lymphome non hodgkinien) ou C84
has_lymphoma = any(
c.startswith(("C83", "C84", "C85")) for c in all_codes
)
assert has_c83 or has_lymphoma, (
f"Aucun candidat C83/C84/C85 — codes: {all_codes}"
)
def test_dp_not_symptom_r59(self):
"""Le DP ne doit PAS être R59.0 (symptôme adénopathie).
Avec le patch alias DLBCL→C83.3, le scoring doit placer C83.3
devant R59.0 grâce au bonus conclusion/alias.
"""
sel = self.dossier.dp_selection
dp = self.dossier.diagnostic_principal
chosen = (sel.chosen_code if sel else None) or (dp.cim10_suggestion if dp else "")
assert not chosen.startswith("R59"), (
f"Pipeline code encore R59.0 (symptôme) au lieu de C83.* — "
f"le patch alias aurait dû corriger ça"
)
def test_c83_is_top1(self):
"""C83.* (DLBCL) doit être le candidat #1 grâce au bonus alias."""
sel = self.dossier.dp_selection
assert sel is not None, "dp_selection est None"
assert len(sel.candidates) > 0, "Aucun candidat"
top1 = sel.candidates[0]
assert (top1.code or "").startswith("C83"), (
f"Top1 = {top1.code} ({top1.label}), attendu C83.* — "
f"tous: {[(c.code, c.score) for c in sel.candidates[:3]]}"
)
def test_evidence_if_confirmed(self):
"""Si verdict CONFIRMED, evidence ne doit pas être vide."""
sel = self.dossier.dp_selection
if sel and sel.verdict == "CONFIRMED":
assert len(sel.evidence) > 0, "CONFIRMED sans evidence — violation règle A1"