"""Tests gold DP scoring : retraitement de vrais CRH avec validation du dp_selection. Lance l'extraction sur des textes anonymisés réels et vérifie que : 1. dp_selection est peuplé (pas None) 2. Le verdict est cohérent 3. Le DP sélectionné correspond au DP attendu (gold) — mode déterministe 4. Les candidats sont scorés et triés """ import json import pytest from pathlib import Path from src.config import DossierMedical from src.medical.cim10_extractor import extract_medical_info from src.extraction.crh_parser import parse_crh BASE = Path(__file__).resolve().parent.parent ANON_DIR = BASE / "output" / "anonymized" STRUCT_DIR = BASE / "output" / "structured" # Gold dossiers DETERMINISTES : DP trouvable sans LLM (CIM10_MAP, regex, code explicite) GOLD_DETERMINISTIC = [ # 1. Classique K85.1 — DP clair dans conclusion via CIM10_MAP ("1_23042753", "CRH_23042753", "K85.1"), # 5. CRH avec DP regex — "pancréatite aiguë biliaire" en conclusion ("21_23111304", "CRH_23111304", "K85.1"), ] # Gold dossiers LLM-DEPENDANTS : DP correct nécessite le LLM (pas dans CIM10_MAP) # On vérifie que le scoring fonctionne, sans exiger le code exact GOLD_LLM_DEPENDENT = [ # K85.0 idiopathique — "pancréatite aiguë" dans MAP → K85.9, le .0 nécessite LLM ("223_23169043", "CRH_23169043", "K85.0", "K85"), # H10.2 pédiatrique — conjonctivite pas dans MAP, nécessite LLM ("250_23196454", "CRH_23196454", "H10.2", None), # D86.9 sarcoïdose — pas dans MAP, nécessite LLM ("144_23097531", "CRH_23097531", "D86.9", None), ] ALL_GOLD = [ (d, c, e) for d, c, e in GOLD_DETERMINISTIC ] + [ (d, c, e) for d, c, e, _ in GOLD_LLM_DEPENDENT ] def _load_crh(dir_name: str, crh_name: str) -> tuple[dict, str]: """Charge le texte anonymisé et le parse avec le CRH parser.""" text_path = ANON_DIR / dir_name / f"{crh_name}_anonymized.txt" if not text_path.exists(): pytest.skip(f"Fichier anonymisé manquant : {text_path}") text = text_path.read_text(encoding="utf-8") parsed = parse_crh(text) return parsed, text def _load_existing_json(dir_name: str, crh_name: str) -> dict: """Charge le JSON existant pour comparaison.""" json_path = STRUCT_DIR / dir_name / f"{crh_name}_cim10.json" if not json_path.exists(): return {} return json.loads(json_path.read_text(encoding="utf-8")) class TestDPGoldDeterministic: """Tests sur dossiers CRH dont le DP est trouvable sans LLM.""" @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC) def test_dp_selection_populated(self, dir_name, crh_name, expected_dp): """dp_selection est peuplé pour chaque CRH gold.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) assert dossier.dp_selection is not None, ( f"{crh_name}: dp_selection est None — le scoring n'a pas été déclenché" ) assert len(dossier.dp_selection.candidates) >= 1, ( f"{crh_name}: aucun candidat DP trouvé" ) @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC) def test_dp_code_matches_gold(self, dir_name, crh_name, expected_dp): """Le DP sélectionné correspond au code gold attendu.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) assert dossier.diagnostic_principal is not None, ( f"{crh_name}: aucun DP extrait" ) actual_code = dossier.diagnostic_principal.cim10_suggestion assert actual_code == expected_dp, ( f"{crh_name}: DP attendu {expected_dp}, obtenu {actual_code}" ) @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC) def test_candidates_have_scores(self, dir_name, crh_name, expected_dp): """Chaque candidat a un score et des détails.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) if dossier.dp_selection is None: pytest.skip("dp_selection absent") for c in dossier.dp_selection.candidates: assert isinstance(c.score, int), f"Score non-entier pour {c.label}" assert isinstance(c.score_details, dict), f"score_details manquant pour {c.label}" assert c.source_section, f"source_section vide pour {c.label}" @pytest.mark.parametrize("dir_name,crh_name,expected_dp", GOLD_DETERMINISTIC) def test_candidates_sorted_by_score(self, dir_name, crh_name, expected_dp): """Les candidats sont triés par score décroissant.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) if dossier.dp_selection is None or len(dossier.dp_selection.candidates) < 2: pytest.skip("Pas assez de candidats pour vérifier le tri") scores = [c.score for c in dossier.dp_selection.candidates] assert scores == sorted(scores, reverse=True), ( f"{crh_name}: candidats non triés par score: {scores}" ) class TestDPGoldLLMDependent: """Tests sur dossiers dont le DP exact nécessite le LLM. On vérifie que le scoring fonctionne (candidats trouvés, triés, scorés) sans exiger le code CIM-10 exact. """ @pytest.mark.parametrize( "dir_name,crh_name,expected_dp,expected_family", GOLD_LLM_DEPENDENT, ) def test_dp_selection_populated(self, dir_name, crh_name, expected_dp, expected_family): """dp_selection est peuplé même sans LLM.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) assert dossier.dp_selection is not None, ( f"{crh_name}: dp_selection est None" ) assert len(dossier.dp_selection.candidates) >= 1, ( f"{crh_name}: aucun candidat DP trouvé" ) @pytest.mark.parametrize( "dir_name,crh_name,expected_dp,expected_family", GOLD_LLM_DEPENDENT, ) def test_dp_family_if_specified(self, dir_name, crh_name, expected_dp, expected_family): """Si une famille est spécifiée, le DP trouvé est dans la bonne famille CIM-10.""" if expected_family is None: pytest.skip("Pas de famille attendue pour ce dossier") parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) if dossier.diagnostic_principal is None: pytest.skip("Aucun DP extrait") actual_code = dossier.diagnostic_principal.cim10_suggestion assert actual_code and actual_code.startswith(expected_family), ( f"{crh_name}: DP attendu famille {expected_family}*, obtenu {actual_code}" ) @pytest.mark.parametrize( "dir_name,crh_name,expected_dp,expected_family", GOLD_LLM_DEPENDENT, ) def test_candidates_have_scores(self, dir_name, crh_name, expected_dp, expected_family): """Chaque candidat a un score et des détails.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) if dossier.dp_selection is None: pytest.skip("dp_selection absent") for c in dossier.dp_selection.candidates: assert isinstance(c.score, int), f"Score non-entier pour {c.label}" assert isinstance(c.score_details, dict), f"score_details manquant pour {c.label}" class TestDPGoldNonRegression: """Vérifie que le nouveau scoring ne dégrade pas les DAS existants. Tolérance élevée car les anciens JSON contenaient des DAS enrichis par LLM (source: "llm_das") qui ne sont pas reproductibles sans Ollama. """ @pytest.mark.parametrize("dir_name,crh_name,expected_dp", ALL_GOLD) def test_das_still_extracted(self, dir_name, crh_name, expected_dp): """Les DAS principaux (non-LLM) sont toujours extraits.""" parsed, text = _load_crh(dir_name, crh_name) dossier = extract_medical_info(parsed, text) old_json = _load_existing_json(dir_name, crh_name) if not old_json: pytest.skip("JSON existant manquant") # Filtrer les DAS LLM-only de l'ancien JSON (source "llm_das") old_das_codes = set() for d in old_json.get("diagnostics_associes", []): code = d.get("cim10_suggestion") source = d.get("source", "") if code and source != "llm_das": old_das_codes.add(code) new_das_codes = { d.cim10_suggestion for d in dossier.diagnostics_associes if d.cim10_suggestion } # Tolérance : le DP code lui-même est exclu des DAS, c'est normal missing = old_das_codes - new_das_codes if missing: missing -= {expected_dp} # Tolérance : codes de même famille que le DP sont exclus (dédup DP/DAS) if expected_dp: missing = {c for c in missing if c[:3] != expected_dp[:3]} # Tolérance élargie : sans LLM beaucoup de DAS manquent # (les anciens JSON avaient des DAS source "llm_das", "conclusion", "edsnlp" # enrichis par le LLM qui ne sont pas reproductibles sans Ollama) assert len(missing) <= 10, ( f"{crh_name}: trop de DAS manquants après le nouveau scoring: {missing}" ) class TestDPGoldVerbose: """Test verbeux pour inspection manuelle — affiche les détails dp_selection.""" @pytest.mark.parametrize("dir_name,crh_name,expected_dp", ALL_GOLD) def test_print_dp_selection(self, dir_name, crh_name, expected_dp, capsys): """Affiche dp_selection pour inspection manuelle (pytest -s).""" parsed, text = _load_crh(dir_name, crh_name) # Lister les sections CRH parsées sections = parsed.get("sections", {}) section_keys = list(sections.keys()) dossier = extract_medical_info(parsed, text) print(f"\n{'='*60}") print(f"GOLD: {crh_name} — DP attendu: {expected_dp}") print(f"Sections CRH parsées: {section_keys}") # Afficher les nouvelles sections si présentes for key in ("diag_sortie", "diag_principal", "synthese"): if key in sections: print(f" {key}: {sections[key][:150]}...") if dossier.dp_selection: sel = dossier.dp_selection print(f"Verdict: {sel.verdict}") print(f"Winner reason: {sel.winner_reason}") for i, c in enumerate(sel.candidates): print(f" [{i+1}] {c.code} — {c.label[:60]} " f"(section={c.source_section}, score={c.score}, " f"details={c.score_details})") else: print("dp_selection: None (Trackare DP?)") actual = dossier.diagnostic_principal if actual: print(f"DP retenu: {actual.cim10_suggestion} — {actual.texte}") print(f"Source: {actual.source}") else: print("AUCUN DP") print(f"{'='*60}")