chore: add .gitignore
This commit is contained in:
272
tests/test_dp_gold.py
Normal file
272
tests/test_dp_gold.py
Normal file
@@ -0,0 +1,272 @@
|
||||
"""Tests gold DP scoring : retraitement de vrais CRH avec validation du dp_selection.
|
||||
|
||||
Lance l'extraction sur des textes anonymisés réels et vérifie que :
|
||||
1. dp_selection est peuplé (pas None)
|
||||
2. Le verdict est cohérent
|
||||
3. Le DP sélectionné correspond au DP attendu (gold) — mode déterministe
|
||||
4. Les candidats sont scorés et triés
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from src.config import DossierMedical
|
||||
from src.medical.cim10_extractor import extract_medical_info
|
||||
from src.extraction.crh_parser import parse_crh
|
||||
|
||||
BASE = Path(__file__).resolve().parent.parent
ANON_DIR = BASE / "output" / "anonymized"      # anonymized CRH text files
STRUCT_DIR = BASE / "output" / "structured"    # previously generated CIM-10 JSON

# DETERMINISTIC gold dossiers: DP findable without the LLM
# (CIM10_MAP lookup, regex, or an explicit code in the text).
# Tuples: (directory name, CRH base name, expected DP code).
GOLD_DETERMINISTIC = [
    # 1. Classic K85.1 — clear DP in the conclusion via CIM10_MAP
    ("1_23042753", "CRH_23042753", "K85.1"),
    # 5. CRH with regex DP — "pancréatite aiguë biliaire" in the conclusion
    ("21_23111304", "CRH_23111304", "K85.1"),
]

# LLM-DEPENDENT gold dossiers: the correct DP requires the LLM (not in CIM10_MAP).
# We check that scoring works without requiring the exact code.
# Tuples: (directory, CRH name, expected DP, expected CIM-10 family or None).
GOLD_LLM_DEPENDENT = [
    # K85.0 idiopathic — "pancréatite aiguë" maps to K85.9; the .0 needs the LLM
    ("223_23169043", "CRH_23169043", "K85.0", "K85"),
    # H10.2 pediatric — conjunctivitis not in the MAP, needs the LLM
    ("250_23196454", "CRH_23196454", "H10.2", None),
    # D86.9 sarcoidosis — not in the MAP, needs the LLM
    ("144_23097531", "CRH_23097531", "D86.9", None),
]

# All gold dossiers as uniform 3-tuples; the deterministic list already has the
# right shape (the old identity comprehension was a pointless repack), and the
# LLM-dependent entries just drop their expected-family column.
ALL_GOLD = list(GOLD_DETERMINISTIC) + [
    (d, c, e) for d, c, e, _ in GOLD_LLM_DEPENDENT
]
|
||||
|
||||
|
||||
def _load_crh(dir_name: str, crh_name: str) -> tuple[dict, str]:
    """Load the anonymized CRH text and run it through the CRH parser.

    Skips the calling test when the anonymized fixture is absent, so the
    suite degrades gracefully on machines without the output/ directory.
    """
    anon_file = ANON_DIR / dir_name / f"{crh_name}_anonymized.txt"
    if not anon_file.exists():
        pytest.skip(f"Fichier anonymisé manquant : {anon_file}")
    raw_text = anon_file.read_text(encoding="utf-8")
    return parse_crh(raw_text), raw_text
|
||||
|
||||
|
||||
def _load_existing_json(dir_name: str, crh_name: str) -> dict:
    """Load the previously generated JSON for comparison ({} when absent)."""
    candidate = STRUCT_DIR / dir_name / f"{crh_name}_cim10.json"
    if candidate.exists():
        return json.loads(candidate.read_text(encoding="utf-8"))
    return {}
|
||||
|
||||
|
||||
class TestDPGoldDeterministic:
    """Tests on CRH dossiers whose DP can be found without the LLM."""

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC)
    def test_dp_selection_populated(self, dir_name, crh_name, expected_dp):
        """dp_selection is filled in for every gold CRH."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        # A None dp_selection means the scoring pipeline never ran.
        assert result.dp_selection is not None, (
            f"{crh_name}: dp_selection est None — le scoring n'a pas été déclenché"
        )
        assert len(result.dp_selection.candidates) >= 1, (
            f"{crh_name}: aucun candidat DP trouvé"
        )

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC)
    def test_dp_code_matches_gold(self, dir_name, crh_name, expected_dp):
        """The selected DP matches the expected gold code."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        dp = result.diagnostic_principal
        assert dp is not None, (
            f"{crh_name}: aucun DP extrait"
        )
        actual_code = dp.cim10_suggestion
        assert actual_code == expected_dp, (
            f"{crh_name}: DP attendu {expected_dp}, obtenu {actual_code}"
        )

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC)
    def test_candidates_have_scores(self, dir_name, crh_name, expected_dp):
        """Every candidate carries a score and scoring details."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        if result.dp_selection is None:
            pytest.skip("dp_selection absent")

        for cand in result.dp_selection.candidates:
            assert isinstance(cand.score, int), f"Score non-entier pour {cand.label}"
            assert isinstance(cand.score_details, dict), f"score_details manquant pour {cand.label}"
            assert cand.source_section, f"source_section vide pour {cand.label}"

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC)
    def test_candidates_sorted_by_score(self, dir_name, crh_name, expected_dp):
        """Candidates come back ordered by decreasing score."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        selection = result.dp_selection
        if selection is None or len(selection.candidates) < 2:
            pytest.skip("Pas assez de candidats pour vérifier le tri")

        scores = [cand.score for cand in selection.candidates]
        assert scores == sorted(scores, reverse=True), (
            f"{crh_name}: candidats non triés par score: {scores}"
        )
|
||||
|
||||
|
||||
class TestDPGoldLLMDependent:
    """Tests on dossiers whose exact DP requires the LLM.

    Only the scoring machinery is checked here (candidates found, scored,
    sorted) — the exact CIM-10 code is not required.
    """

    @pytest.mark.parametrize(
        "dir_name,crh_name,expected_dp,expected_family",
        GOLD_LLM_DEPENDENT,
    )
    def test_dp_selection_populated(self, dir_name, crh_name, expected_dp, expected_family):
        """dp_selection is filled in even without the LLM."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        assert result.dp_selection is not None, (
            f"{crh_name}: dp_selection est None"
        )
        assert len(result.dp_selection.candidates) >= 1, (
            f"{crh_name}: aucun candidat DP trouvé"
        )

    @pytest.mark.parametrize(
        "dir_name,crh_name,expected_dp,expected_family",
        GOLD_LLM_DEPENDENT,
    )
    def test_dp_family_if_specified(self, dir_name, crh_name, expected_dp, expected_family):
        """When a family is given, the selected DP falls in that CIM-10 family."""
        if expected_family is None:
            pytest.skip("Pas de famille attendue pour ce dossier")

        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        if result.diagnostic_principal is None:
            pytest.skip("Aucun DP extrait")

        actual_code = result.diagnostic_principal.cim10_suggestion
        assert actual_code and actual_code.startswith(expected_family), (
            f"{crh_name}: DP attendu famille {expected_family}*, obtenu {actual_code}"
        )

    @pytest.mark.parametrize(
        "dir_name,crh_name,expected_dp,expected_family",
        GOLD_LLM_DEPENDENT,
    )
    def test_candidates_have_scores(self, dir_name, crh_name, expected_dp, expected_family):
        """Every candidate carries a score and scoring details."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)

        if result.dp_selection is None:
            pytest.skip("dp_selection absent")

        for cand in result.dp_selection.candidates:
            assert isinstance(cand.score, int), f"Score non-entier pour {cand.label}"
            assert isinstance(cand.score_details, dict), f"score_details manquant pour {cand.label}"
||||
|
||||
|
||||
class TestDPGoldNonRegression:
    """Checks that the new scoring does not degrade the existing DAS.

    Tolerance is high: the old JSON files contained DAS enriched by the
    LLM (source: "llm_das") which are not reproducible without Ollama.
    """

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", ALL_GOLD)
    def test_das_still_extracted(self, dir_name, crh_name, expected_dp):
        """The main (non-LLM) DAS are still extracted."""
        parsed, raw = _load_crh(dir_name, crh_name)
        result = extract_medical_info(parsed, raw)
        previous = _load_existing_json(dir_name, crh_name)

        if not previous:
            pytest.skip("JSON existant manquant")

        # Drop the LLM-only DAS (source "llm_das") from the old JSON.
        old_das_codes = {
            entry.get("cim10_suggestion")
            for entry in previous.get("diagnostics_associes", [])
            if entry.get("cim10_suggestion") and entry.get("source", "") != "llm_das"
        }

        new_das_codes = {
            das.cim10_suggestion
            for das in result.diagnostics_associes
            if das.cim10_suggestion
        }

        # Tolerance: the DP code itself is excluded from the DAS — by design.
        missing = (old_das_codes - new_das_codes) - {expected_dp}
        # Tolerance: codes in the DP's own family are excluded (DP/DAS dedup).
        if expected_dp:
            missing = {code for code in missing if code[:3] != expected_dp[:3]}

        # Broad tolerance: without the LLM many DAS are absent (the old JSON
        # had DAS with sources "llm_das", "conclusion", "edsnlp" enriched by
        # the LLM, which are not reproducible without Ollama).
        assert len(missing) <= 10, (
            f"{crh_name}: trop de DAS manquants après le nouveau scoring: {missing}"
        )
|
||||
|
||||
|
||||
class TestDPGoldVerbose:
    """Verbose test for manual inspection — dumps the dp_selection details."""

    @pytest.mark.parametrize("dir_name,crh_name,expected_dp", ALL_GOLD)
    def test_print_dp_selection(self, dir_name, crh_name, expected_dp, capsys):
        """Print dp_selection for manual inspection (run with pytest -s)."""
        parsed, raw = _load_crh(dir_name, crh_name)

        # Snapshot the parsed CRH sections before extraction runs.
        sections = parsed.get("sections", {})
        section_keys = list(sections.keys())

        result = extract_medical_info(parsed, raw)

        banner = "=" * 60
        print(f"\n{banner}")
        print(f"GOLD: {crh_name} — DP attendu: {expected_dp}")
        print(f"Sections CRH parsées: {section_keys}")

        # Show the newer sections when present.
        for key in ("diag_sortie", "diag_principal", "synthese"):
            if key in sections:
                print(f" {key}: {sections[key][:150]}...")

        selection = result.dp_selection
        if selection:
            print(f"Verdict: {selection.verdict}")
            print(f"Winner reason: {selection.winner_reason}")
            for rank, cand in enumerate(selection.candidates, start=1):
                print(f" [{rank}] {cand.code} — {cand.label[:60]} "
                      f"(section={cand.source_section}, score={cand.score}, "
                      f"details={cand.score_details})")
        else:
            print("dp_selection: None (Trackare DP?)")

        dp = result.diagnostic_principal
        if dp:
            print(f"DP retenu: {dp.cim10_suggestion} — {dp.texte}")
            print(f"Source: {dp.source}")
        else:
            print("AUCUN DP")
        print(banner)
|
||||
Reference in New Issue
Block a user