#!/usr/bin/env python3 """Benchmark qualité DP scoring déterministe vs pipeline LLM. Compare le DP trouvé par le nouveau scoring déterministe (± fallback LLM) avec le DP de référence (gold) extrait par le pipeline complet (avec LLM). Métriques (calculées sur dossiers avec gold_dp non-None uniquement) : - exact_match : code identique - family4 : 4 premiers chars identiques (ex: K85.1 vs K85.0) - family3 : 3 premiers chars identiques (ex: K85.x vs K86.x → non) - coverage_dp : % de dossiers où un DP est proposé (new_code non-None) Usage: .venv/bin/python3 benchmark_quality.py [--limit 50] [--verbose] .venv/bin/python3 benchmark_quality.py --limit 50 --use-llm --verbose """ from __future__ import annotations import argparse import json import sys import time from collections import Counter from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent)) from src.extraction.crh_parser import parse_crh from src.config import DossierMedical, Diagnostic from src.medical.cim10_extractor import ( _extract_sejour, _extract_actes, _extract_biologie, _extract_imagerie, ) from src.medical.dp_scoring import ( build_dp_shortlist, build_dp_candidate_pool, score_candidates, select_dp, llm_dp_fallback, llm_dp_pool_rank, generate_synthese_pmsi, ) BASE = Path(__file__).resolve().parent ANON_DIR = BASE / "output" / "anonymized" STRUCT_DIR = BASE / "output" / "structured" def find_crh_dossiers(limit: int = 50) -> list[dict]: """Trouve les dossiers avec CRH anonymisé ET JSON gold.""" dossiers = [] for anon_dir in sorted(ANON_DIR.iterdir()): if not anon_dir.is_dir(): continue dir_name = anon_dir.name crh_files = list(anon_dir.glob("CRH_*_anonymized.txt")) if not crh_files: continue crh_file = crh_files[0] crh_name = crh_file.stem.replace("_anonymized", "") gold_json = STRUCT_DIR / dir_name / f"{crh_name}_cim10.json" if not gold_json.exists(): continue dossiers.append({ "dir_name": dir_name, "crh_name": crh_name, "text_path": crh_file, "gold_path": gold_json, }) if len(dossiers) >= limit: break return dossiers def load_gold_dp(gold_path: Path) -> dict: """Charge le DP de référence depuis le JSON gold.""" data = json.loads(gold_path.read_text(encoding="utf-8")) dp = data.get("diagnostic_principal", {}) return { "code": dp.get("cim10_suggestion"), "label": dp.get("texte", ""), "confidence": dp.get("cim10_confidence", ""), "source": dp.get("source", ""), } def run_dp_only(text_path: Path, use_llm: bool = False) -> dict: """Extraction DP ciblée : scoring déterministe + fallback LLM optionnel.""" text = text_path.read_text(encoding="utf-8") parsed = parse_crh(text) dossier = DossierMedical() dossier.document_type = parsed.get("type", "") _extract_sejour(parsed, dossier) _extract_biologie(text, dossier) _extract_actes(text, dossier) _extract_imagerie(text, dossier) edsnlp_result = None try: from src.medical.edsnlp_pipeline import run_edsnlp edsnlp_result = run_edsnlp(text) except Exception: pass candidates = build_dp_shortlist(parsed, text, edsnlp_result, dossier) candidates = score_candidates(candidates, dossier, full_text=text) selection = select_dp(candidates, dossier, use_llm=use_llm) # Instrumentation : comorbidity fallback comorbidity_fallback = ( selection.verdict == "review" and "comorbidité banale" in (selection.winner_reason or "") ) dp_pre_llm = None if comorbidity_fallback and selection.candidates: c0 = selection.candidates[0] dp_pre_llm = {"code": c0.code, "section": c0.source_section} if use_llm and selection.verdict == "review": llm_selection = llm_dp_fallback( parsed, text, dossier, dp_candidates=candidates, edsnlp_result=edsnlp_result, ) if llm_selection.candidates: all_candidates = list(llm_selection.candidates) if selection.candidates: all_candidates.extend(selection.candidates) llm_selection.candidates = all_candidates selection = llm_selection dossier.dp_selection = selection if selection.candidates: winner = selection.candidates[0] dossier.diagnostic_principal = Diagnostic( texte=winner.label, cim10_suggestion=winner.code, source=winner.source_section, source_page=winner.source_page, source_excerpt=winner.source_excerpt, ) result = { "dp_code": None, "dp_label": "", "dp_source": "", "verdict": None, "winner_reason": None, "candidates": [], "comorbidity_fallback": comorbidity_fallback, "dp_pre_llm": dp_pre_llm, } if dossier.diagnostic_principal: result["dp_code"] = dossier.diagnostic_principal.cim10_suggestion result["dp_label"] = dossier.diagnostic_principal.texte result["dp_source"] = dossier.diagnostic_principal.source or "" if dossier.dp_selection: sel = dossier.dp_selection result["verdict"] = sel.verdict result["winner_reason"] = sel.winner_reason result["candidates"] = [ {"code": c.code, "label": c.label, "section": c.source_section, "score": c.score, "details": c.score_details} for c in sel.candidates ] return result def run_dp_pool_rank(text_path: Path) -> dict: """DP Pool Rank : génère SynthesePMSI + pool, puis LLM choisit parmi le pool.""" text = text_path.read_text(encoding="utf-8") parsed = parse_crh(text) dossier = DossierMedical() dossier.document_type = parsed.get("type", "") _extract_sejour(parsed, dossier) _extract_biologie(text, dossier) _extract_actes(text, dossier) _extract_imagerie(text, dossier) edsnlp_result = None try: from src.medical.edsnlp_pipeline import run_edsnlp edsnlp_result = run_edsnlp(text) except Exception: pass # 1. Synthèse PMSI synthese = generate_synthese_pmsi(parsed, text, dossier) # 2. Pool de candidats pool = build_dp_candidate_pool(parsed, text, edsnlp_result, dossier) # 3. LLM pool rank dp_shortlist = build_dp_shortlist(parsed, text, edsnlp_result, dossier) dp_shortlist = score_candidates(dp_shortlist, dossier, full_text=text) selection = llm_dp_pool_rank( parsed, text, dossier, pool_candidates=pool, synthese=synthese, fallback_oneshot=True, dp_candidates=dp_shortlist, edsnlp_result=edsnlp_result, ) dossier.dp_selection = selection if selection.candidates: winner = selection.candidates[0] dossier.diagnostic_principal = Diagnostic( texte=winner.label, cim10_suggestion=winner.code, source=winner.source_section, source_page=winner.source_page, source_excerpt=winner.source_excerpt, ) result = { "dp_code": None, "dp_label": "", "dp_source": "", "verdict": None, "winner_reason": None, "candidates": [], "pool_size": len(pool), "pool_top10": [ {"terme": c.terme, "section": c.section, "preuve": c.preuve[:120], "score": round(c.score_initial, 2)} for c in pool[:10] ], "synthese": synthese.model_dump() if synthese else None, } if dossier.diagnostic_principal: result["dp_code"] = dossier.diagnostic_principal.cim10_suggestion result["dp_label"] = dossier.diagnostic_principal.texte result["dp_source"] = dossier.diagnostic_principal.source or "" if dossier.dp_selection: sel = dossier.dp_selection result["verdict"] = sel.verdict result["winner_reason"] = sel.winner_reason result["candidates"] = [ {"code": c.code, "label": c.label, "section": c.source_section, "score": c.score, "details": c.score_details} for c in sel.candidates ] return result # --- Matching helpers --- def _norm(code: str) -> str: """Normalise un code CIM-10 pour comparaison : supprime le point.""" return code.replace(".", "") def match_exact(a: str | None, b: str | None) -> bool: if not a or not b: return False return a == b def match_family4(a: str | None, b: str | None) -> bool: """4 premiers chars sans point identiques (ex: K851 vs K850 → True).""" if not a or not b: return False return _norm(a)[:4] == _norm(b)[:4] def match_family3(a: str | None, b: str | None) -> bool: """3 premiers chars identiques (ex: K85.x → K85).""" if not a or not b: return False return a[:3] == b[:3] def main(): parser = argparse.ArgumentParser(description="Benchmark qualité DP scoring") parser.add_argument("--limit", type=int, default=50, help="Nombre de dossiers") parser.add_argument("--verbose", action="store_true", help="Afficher chaque dossier") parser.add_argument("--use-llm", action="store_true", help="Activer le fallback LLM DP sur les REVIEW (nécessite Ollama)") parser.add_argument("--synthese-pmsi", action="store_true", help="Générer la SynthesePMSI pour chaque dossier (nécessite Ollama)") parser.add_argument("--dp-candidates", action="store_true", help="Générer et afficher le DP Candidate Pool pour chaque dossier") parser.add_argument("--use-dp-pool-rank", action="store_true", help="Utiliser le mode DP Pool Rank (LLM choisit parmi le pool, nécessite Ollama)") args = parser.parse_args() mode = "déterministe + LLM fallback" if args.use_llm else "déterministe seul" if args.use_dp_pool_rank: mode = "DP Pool Rank (LLM choisit parmi pool)" if args.synthese_pmsi: mode += " + SynthesePMSI" if args.dp_candidates: mode += " + DPCandidatePool" print(f"=== Benchmark DP scoring {mode} (n={args.limit}) ===\n") dossiers = find_crh_dossiers(limit=args.limit) print(f"Dossiers CRH trouvés : {len(dossiers)}\n") if not dossiers: print("ERREUR : aucun dossier CRH avec gold JSON trouvé") return # Compteurs total = len(dossiers) gold_none = 0 # gold_dp = None (exclus des métriques de match) evaluable = 0 # gold_dp non-None → base pour exact/family exact = 0 fam4 = 0 fam3 = 0 coverage_has_dp = 0 # new_code non-None (sur total) review_count = 0 confirmed_count = 0 comorbidity_fallback_count = 0 errors: list[dict] = [] review_reasons: list[str] = [] syntheses: list[dict] = [] # (crh_name, gold_code, new_code, synthese_dict) dp_pools: list[dict] = [] pool_rank_results: list[dict] = [] # résultats détaillés pool rank gold_confidences = Counter() verdicts = Counter() source_sections = Counter() timings: list[float] = [] for i, d in enumerate(dossiers): gold = load_gold_dp(d["gold_path"]) gold_code = gold["code"] gold_confidences[gold["confidence"] or "none"] += 1 t0 = time.time() if args.use_dp_pool_rank: result = run_dp_pool_rank(d["text_path"]) pool_rank_results.append({ "crh": d["crh_name"], "gold_code": gold_code, "dp_label": result["dp_label"], "dp_source": result["dp_source"], "verdict": result["verdict"], "winner_reason": result["winner_reason"], "pool_size": result.get("pool_size", 0), "pool_top10": result.get("pool_top10", []), "synthese": result.get("synthese"), "candidates": result.get("candidates", []), }) else: result = run_dp_only(d["text_path"], use_llm=args.use_llm) elapsed = time.time() - t0 timings.append(elapsed) new_code = result["dp_code"] # SynthesePMSI optionnelle if args.synthese_pmsi: text_synth = d["text_path"].read_text(encoding="utf-8") parsed_synth = parse_crh(text_synth) dossier_tmp = DossierMedical() dossier_tmp.document_type = parsed_synth.get("type", "") _extract_sejour(parsed_synth, dossier_tmp) _extract_actes(text_synth, dossier_tmp) synthese = generate_synthese_pmsi(parsed_synth, text_synth, dossier_tmp) syntheses.append({ "crh": d["crh_name"], "gold_code": gold_code, "new_code": new_code, "synthese": synthese.model_dump() if synthese else None, }) # DP Candidate Pool optionnel if args.dp_candidates: text_pool = d["text_path"].read_text(encoding="utf-8") parsed_pool = parse_crh(text_pool) dossier_pool = DossierMedical() dossier_pool.document_type = parsed_pool.get("type", "") _extract_sejour(parsed_pool, dossier_pool) _extract_actes(text_pool, dossier_pool) edsnlp_pool = None try: from src.medical.edsnlp_pipeline import run_edsnlp edsnlp_pool = run_edsnlp(text_pool) except Exception: pass pool = build_dp_candidate_pool(parsed_pool, text_pool, edsnlp_pool, dossier_pool) dp_pools.append({ "crh": d["crh_name"], "gold_code": gold_code, "new_code": new_code, "pool_size": len(pool), "candidates": [ {"terme": c.terme, "section": c.section, "preuve": c.preuve[:120], "score": round(c.score_initial, 2)} for c in pool ], }) verdict = result["verdict"] verdicts[verdict or "no_selection"] += 1 if result["dp_source"]: source_sections[result["dp_source"]] += 1 # Coverage : new_code proposé (sur total) if new_code: coverage_has_dp += 1 # Métriques de match : uniquement si gold_dp non-None if gold_code is None: gold_none += 1 else: evaluable += 1 is_exact = match_exact(new_code, gold_code) is_f4 = match_family4(new_code, gold_code) is_f3 = match_family3(new_code, gold_code) if is_exact: exact += 1 if is_f4: fam4 += 1 if is_f3: fam3 += 1 # Erreurs (non-exact avec gold) if not is_exact: errors.append({ "dir": d["dir_name"], "crh": d["crh_name"], "gold_code": gold_code, "gold_label": gold["label"], "gold_conf": gold["confidence"], "new_code": new_code or "(aucun)", "new_label": result["dp_label"] or "(aucun)", "new_source": result["dp_source"], "verdict": verdict, "winner_reason": result["winner_reason"] or "", "candidates": result["candidates"][:3], "is_f4": is_f4, "is_f3": is_f3, }) if result.get("comorbidity_fallback"): comorbidity_fallback_count += 1 if verdict == "review": review_count += 1 if result["winner_reason"]: review_reasons.append(result["winner_reason"]) elif verdict == "confirmed": confirmed_count += 1 if args.verbose: if gold_code is None: tag = "SKIP" elif match_exact(new_code, gold_code): tag = "EXACT" elif match_family4(new_code, gold_code): tag = "FAM4" elif match_family3(new_code, gold_code): tag = "FAM3" else: tag = "MISS" print(f" [{i+1:3d}] {d['crh_name']} : gold={gold_code} new={new_code} " f"[{tag}] verdict={verdict} ({elapsed:.1f}s)") # === Rapport === print(f"\n{'='*60}") print(f"RESULTATS — {total} dossiers CRH ({mode})") print(f"{'='*60}\n") pct = lambda n, d: n / d * 100 if d else 0 print(f" Évaluables (gold non-None) : {evaluable}/{total} (excl. {gold_none} sans gold DP)") print() print(f" DP exact match : {exact}/{evaluable} ({pct(exact, evaluable):.1f}%)") print(f" DP family4 : {fam4}/{evaluable} ({pct(fam4, evaluable):.1f}%)") print(f" DP family3 : {fam3}/{evaluable} ({pct(fam3, evaluable):.1f}%)") print(f" Coverage DP : {coverage_has_dp}/{total} ({pct(coverage_has_dp, total):.1f}%)") print() print(f" Verdict REVIEW : {review_count}/{total} ({pct(review_count, total):.1f}%)") print(f" Verdict CONFIRM: {confirmed_count}/{total} ({pct(confirmed_count, total):.1f}%)") print(f" Comorbidité FB : {comorbidity_fallback_count}/{total} ({pct(comorbidity_fallback_count, total):.1f}%)") if timings: avg_t = sum(timings) / len(timings) print(f"\n Temps moyen : {avg_t:.1f}s/dossier") print(f" Temps total : {sum(timings):.1f}s") print(f"\n Gold confidence :") for conf, cnt in gold_confidences.most_common(): print(f" {conf:8s} : {cnt}") print(f"\n Sources DP (new) :") for src, cnt in source_sections.most_common(): print(f" {src:35s} : {cnt}") print(f"\n Verdicts :") for v, cnt in verdicts.most_common(): print(f" {v:15s} : {cnt}") if review_reasons: print(f"\n Top 5 review reasons :") reason_patterns = Counter() for r in review_reasons: if "aucun candidat" in r: reason_patterns["aucun candidat DP trouvé"] += 1 elif "delta insuffisant" in r: reason_patterns["delta insuffisant (ambiguïté)"] += 1 elif "evidence_excerpt vide" in r: reason_patterns["LLM: evidence_excerpt vide"] += 1 elif "comorbidité" in r: reason_patterns["LLM: comorbidité hors section forte"] += 1 elif "code invalide" in r: reason_patterns["LLM: code CIM-10 invalide"] += 1 elif "LLM non disponible" in r or "erreur LLM" in r: reason_patterns["LLM: erreur/indisponible"] += 1 elif "réponse LLM invalide" in r: reason_patterns["LLM: réponse invalide"] += 1 elif "section faible" in r or "confidence" in r: reason_patterns["LLM: garde-fou (section/confidence)"] += 1 else: reason_patterns[r[:60]] += 1 for reason, cnt in reason_patterns.most_common(5): print(f" [{cnt:2d}] {reason}") if errors: print(f"\n{'='*60}") print(f"ERREURS DP — {len(errors)} dossiers (5 premiers)") print(f"{'='*60}\n") for e in errors[:5]: fam_tag = " [fam4]" if e.get("is_f4") else (" [fam3]" if e.get("is_f3") else "") print(f" {e['crh']} ({e['dir']}){fam_tag}") print(f" Gold : {e['gold_code']} — {e['gold_label'][:60]} (conf={e['gold_conf']})") print(f" New : {e['new_code']} — {e['new_label'][:60]}") print(f" Source: {e['new_source']}, Verdict: {e['verdict']}") if e.get('winner_reason'): print(f" Reason: {e['winner_reason'][:80]}") if e['candidates']: print(f" Candidats :") for c in e['candidates']: print(f" {c['code']} — {c['label'][:50]} " f"(section={c['section']}, score={c['score']})") print() # Affichage des synthèses PMSI si activé if args.synthese_pmsi and syntheses: print(f"\n{'='*60}") print(f"SYNTHESES PMSI — {len(syntheses)} dossiers") print(f"{'='*60}") for s in syntheses: print(f"\n --- {s['crh']} (gold={s['gold_code']}, new={s['new_code']}) ---") syn = s.get("synthese") if not syn: print(" (échec génération)") continue print(f" Motif admission : {syn.get('motif_admission', '')[:100]}") print(f" Problème PEC : {syn.get('probleme_pris_en_charge', '')[:100]}") print(f" Diagnostic retenu : {syn.get('diagnostic_retenu', '')[:100]}") actes = syn.get("actes_ou_traitements_majeurs", []) if actes: print(f" Actes/traitements : {', '.join(a[:60] for a in actes[:4])}") compli = syn.get("complications", []) if compli: print(f" Complications : {', '.join(c[:60] for c in compli[:3])}") comor = syn.get("terrain_comorbidites", []) if comor: print(f" Terrain/comorbidités: {', '.join(c[:60] for c in comor[:5])}") preuves = syn.get("preuves", []) if preuves: print(f" Preuves ({len(preuves)}) :") for p in preuves[:3]: print(f" [{p.get('section', '?')}] {p.get('excerpt', '')[:120]}") # Affichage des résultats DP Pool Rank si activé if args.use_dp_pool_rank and pool_rank_results: print(f"\n{'='*60}") print(f"DP POOL RANK — {len(pool_rank_results)} dossiers") print(f"{'='*60}") chosen_ok = sum(1 for r in pool_rank_results if r["dp_label"]) print(f"\n Choix effectué : {chosen_ok}/{len(pool_rank_results)} " f"({chosen_ok/len(pool_rank_results)*100:.0f}%)") for r in pool_rank_results: print(f"\n --- {r['crh']} (gold={r['gold_code']}) ---") # SynthesePMSI syn = r.get("synthese") if syn: print(f" SynthesePMSI :") print(f" Motif admission : {syn.get('motif_admission', '')[:80]}") print(f" Problème PEC : {syn.get('probleme_pris_en_charge', '')[:80]}") print(f" Diag retenu : {syn.get('diagnostic_retenu', '')[:80]}") else: print(f" SynthesePMSI : (non disponible)") # Pool top 10 print(f" Pool ({r['pool_size']} candidats) :") for j, c in enumerate(r.get("pool_top10", [])[:10]): print(f" [{j}] {c['terme'][:55]:55s} ({c['section']}, {c['score']:.2f})") # Résultat LLM print(f" >>> DP choisi : {r['dp_label'][:70] or '(aucun)'}") print(f" Source : {r['dp_source']}") print(f" Verdict : {r['verdict']}") print(f" Reason : {(r['winner_reason'] or '')[:100]}") # Affichage des DP Candidate Pools si activé if args.dp_candidates and dp_pools: print(f"\n{'='*60}") print(f"DP CANDIDATE POOL — {len(dp_pools)} dossiers") print(f"{'='*60}") pool_sizes = [p["pool_size"] for p in dp_pools] print(f"\n Taille pool : min={min(pool_sizes)}, max={max(pool_sizes)}, " f"moy={sum(pool_sizes)/len(pool_sizes):.1f}") for p in dp_pools: print(f"\n --- {p['crh']} (gold={p['gold_code']}, new={p['new_code']}) " f"— {p['pool_size']} candidats ---") for i, c in enumerate(p["candidates"][:10], 1): print(f" [{i:2d}] {c['terme'][:60]:60s} " f"({c['section']}, score={c['score']:.2f})") if c["preuve"]: print(f" preuve: {c['preuve'][:100]}") # JSON exportable summary = { "mode": mode, "total": total, "evaluable": evaluable, "gold_none": gold_none, "exact_match": exact, "exact_match_pct": round(pct(exact, evaluable), 1), "family4": fam4, "family4_pct": round(pct(fam4, evaluable), 1), "family3": fam3, "family3_pct": round(pct(fam3, evaluable), 1), "coverage_dp": coverage_has_dp, "coverage_dp_pct": round(pct(coverage_has_dp, total), 1), "review_count": review_count, "review_pct": round(pct(review_count, total), 1), "confirmed_count": confirmed_count, "comorbidity_fallback_count": comorbidity_fallback_count, "comorbidity_fallback_pct": round(pct(comorbidity_fallback_count, total), 1), "errors": errors, } if args.synthese_pmsi: summary["syntheses_pmsi"] = syntheses if args.dp_candidates: summary["dp_pools"] = dp_pools if args.use_dp_pool_rank: summary["pool_rank_results"] = pool_rank_results suffix = "_llm" if args.use_llm else "" if args.use_dp_pool_rank: suffix = "_pool_rank" if args.synthese_pmsi: suffix += "_synthese" if args.dp_candidates: suffix += "_pool" out_path = BASE / "output" / f"benchmark_dp_quality{suffix}.json" out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") print(f"\nRésultats exportés : {out_path}") if __name__ == "__main__": main()