feat(pmsi): add DP candidate pool + pool rank LLM + benchmark
- DPPoolCandidate model (terme, section, preuve, score_initial) - build_dp_candidate_pool() with filters (_is_pool_excluded, _dedup_pool) - Pool exclusion: admin noise, bio values, vague symptoms, place names - DP_POOL_RANK template for LLM-based ranking among pool candidates - llm_dp_pool_rank() with guardrails (GF-1 evidence, GF-3 confidence) - benchmark_quality.py: --dp-candidates, --use-dp-pool-rank flags - 41 new tests (pool, exclusion, dedup, pool rank, synthese) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
660
benchmark_quality.py
Normal file
660
benchmark_quality.py
Normal file
@@ -0,0 +1,660 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Benchmark qualité DP scoring déterministe vs pipeline LLM.
|
||||
|
||||
Compare le DP trouvé par le nouveau scoring déterministe (± fallback LLM)
|
||||
avec le DP de référence (gold) extrait par le pipeline complet (avec LLM).
|
||||
|
||||
Métriques (calculées sur dossiers avec gold_dp non-None uniquement) :
|
||||
- exact_match : code identique
|
||||
- family4 : 4 premiers chars sans point identiques (ex: K85.12 vs K85.18 → oui ; K85.1 vs K85.0 → non)
|
||||
- family3 : 3 premiers chars identiques (ex: K85.x vs K86.x → non)
|
||||
- coverage_dp : % de dossiers où un DP est proposé (new_code non-None)
|
||||
|
||||
Usage:
|
||||
.venv/bin/python3 benchmark_quality.py [--limit 50] [--verbose]
|
||||
.venv/bin/python3 benchmark_quality.py --limit 50 --use-llm --verbose
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
|
||||
from src.extraction.crh_parser import parse_crh
|
||||
from src.config import DossierMedical, Diagnostic
|
||||
from src.medical.cim10_extractor import (
|
||||
_extract_sejour,
|
||||
_extract_actes,
|
||||
_extract_biologie,
|
||||
_extract_imagerie,
|
||||
)
|
||||
from src.medical.dp_scoring import (
|
||||
build_dp_shortlist,
|
||||
build_dp_candidate_pool,
|
||||
score_candidates,
|
||||
select_dp,
|
||||
llm_dp_fallback,
|
||||
llm_dp_pool_rank,
|
||||
generate_synthese_pmsi,
|
||||
)
|
||||
|
||||
BASE = Path(__file__).resolve().parent
|
||||
ANON_DIR = BASE / "output" / "anonymized"
|
||||
STRUCT_DIR = BASE / "output" / "structured"
|
||||
|
||||
|
||||
def find_crh_dossiers(limit: int = 50) -> list[dict]:
|
||||
"""Trouve les dossiers avec CRH anonymisé ET JSON gold."""
|
||||
dossiers = []
|
||||
for anon_dir in sorted(ANON_DIR.iterdir()):
|
||||
if not anon_dir.is_dir():
|
||||
continue
|
||||
dir_name = anon_dir.name
|
||||
crh_files = list(anon_dir.glob("CRH_*_anonymized.txt"))
|
||||
if not crh_files:
|
||||
continue
|
||||
crh_file = crh_files[0]
|
||||
crh_name = crh_file.stem.replace("_anonymized", "")
|
||||
gold_json = STRUCT_DIR / dir_name / f"{crh_name}_cim10.json"
|
||||
if not gold_json.exists():
|
||||
continue
|
||||
dossiers.append({
|
||||
"dir_name": dir_name,
|
||||
"crh_name": crh_name,
|
||||
"text_path": crh_file,
|
||||
"gold_path": gold_json,
|
||||
})
|
||||
if len(dossiers) >= limit:
|
||||
break
|
||||
return dossiers
|
||||
|
||||
|
||||
def load_gold_dp(gold_path: Path) -> dict:
|
||||
"""Charge le DP de référence depuis le JSON gold."""
|
||||
data = json.loads(gold_path.read_text(encoding="utf-8"))
|
||||
dp = data.get("diagnostic_principal", {})
|
||||
return {
|
||||
"code": dp.get("cim10_suggestion"),
|
||||
"label": dp.get("texte", ""),
|
||||
"confidence": dp.get("cim10_confidence", ""),
|
||||
"source": dp.get("source", ""),
|
||||
}
|
||||
|
||||
|
||||
def run_dp_only(text_path: Path, use_llm: bool = False) -> dict:
|
||||
"""Extraction DP ciblée : scoring déterministe + fallback LLM optionnel."""
|
||||
text = text_path.read_text(encoding="utf-8")
|
||||
parsed = parse_crh(text)
|
||||
|
||||
dossier = DossierMedical()
|
||||
dossier.document_type = parsed.get("type", "")
|
||||
_extract_sejour(parsed, dossier)
|
||||
_extract_biologie(text, dossier)
|
||||
_extract_actes(text, dossier)
|
||||
_extract_imagerie(text, dossier)
|
||||
|
||||
edsnlp_result = None
|
||||
try:
|
||||
from src.medical.edsnlp_pipeline import run_edsnlp
|
||||
edsnlp_result = run_edsnlp(text)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
candidates = build_dp_shortlist(parsed, text, edsnlp_result, dossier)
|
||||
candidates = score_candidates(candidates, dossier, full_text=text)
|
||||
selection = select_dp(candidates, dossier, use_llm=use_llm)
|
||||
|
||||
# Instrumentation : comorbidity fallback
|
||||
comorbidity_fallback = (
|
||||
selection.verdict == "review"
|
||||
and "comorbidité banale" in (selection.winner_reason or "")
|
||||
)
|
||||
dp_pre_llm = None
|
||||
if comorbidity_fallback and selection.candidates:
|
||||
c0 = selection.candidates[0]
|
||||
dp_pre_llm = {"code": c0.code, "section": c0.source_section}
|
||||
|
||||
if use_llm and selection.verdict == "review":
|
||||
llm_selection = llm_dp_fallback(
|
||||
parsed, text, dossier,
|
||||
dp_candidates=candidates,
|
||||
edsnlp_result=edsnlp_result,
|
||||
)
|
||||
if llm_selection.candidates:
|
||||
all_candidates = list(llm_selection.candidates)
|
||||
if selection.candidates:
|
||||
all_candidates.extend(selection.candidates)
|
||||
llm_selection.candidates = all_candidates
|
||||
selection = llm_selection
|
||||
|
||||
dossier.dp_selection = selection
|
||||
if selection.candidates:
|
||||
winner = selection.candidates[0]
|
||||
dossier.diagnostic_principal = Diagnostic(
|
||||
texte=winner.label,
|
||||
cim10_suggestion=winner.code,
|
||||
source=winner.source_section,
|
||||
source_page=winner.source_page,
|
||||
source_excerpt=winner.source_excerpt,
|
||||
)
|
||||
|
||||
result = {
|
||||
"dp_code": None,
|
||||
"dp_label": "",
|
||||
"dp_source": "",
|
||||
"verdict": None,
|
||||
"winner_reason": None,
|
||||
"candidates": [],
|
||||
"comorbidity_fallback": comorbidity_fallback,
|
||||
"dp_pre_llm": dp_pre_llm,
|
||||
}
|
||||
if dossier.diagnostic_principal:
|
||||
result["dp_code"] = dossier.diagnostic_principal.cim10_suggestion
|
||||
result["dp_label"] = dossier.diagnostic_principal.texte
|
||||
result["dp_source"] = dossier.diagnostic_principal.source or ""
|
||||
if dossier.dp_selection:
|
||||
sel = dossier.dp_selection
|
||||
result["verdict"] = sel.verdict
|
||||
result["winner_reason"] = sel.winner_reason
|
||||
result["candidates"] = [
|
||||
{"code": c.code, "label": c.label, "section": c.source_section,
|
||||
"score": c.score, "details": c.score_details}
|
||||
for c in sel.candidates
|
||||
]
|
||||
return result
|
||||
|
||||
|
||||
def run_dp_pool_rank(text_path: Path) -> dict:
|
||||
"""DP Pool Rank : génère SynthesePMSI + pool, puis LLM choisit parmi le pool."""
|
||||
text = text_path.read_text(encoding="utf-8")
|
||||
parsed = parse_crh(text)
|
||||
|
||||
dossier = DossierMedical()
|
||||
dossier.document_type = parsed.get("type", "")
|
||||
_extract_sejour(parsed, dossier)
|
||||
_extract_biologie(text, dossier)
|
||||
_extract_actes(text, dossier)
|
||||
_extract_imagerie(text, dossier)
|
||||
|
||||
edsnlp_result = None
|
||||
try:
|
||||
from src.medical.edsnlp_pipeline import run_edsnlp
|
||||
edsnlp_result = run_edsnlp(text)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 1. Synthèse PMSI
|
||||
synthese = generate_synthese_pmsi(parsed, text, dossier)
|
||||
|
||||
# 2. Pool de candidats
|
||||
pool = build_dp_candidate_pool(parsed, text, edsnlp_result, dossier)
|
||||
|
||||
# 3. LLM pool rank
|
||||
dp_shortlist = build_dp_shortlist(parsed, text, edsnlp_result, dossier)
|
||||
dp_shortlist = score_candidates(dp_shortlist, dossier, full_text=text)
|
||||
|
||||
selection = llm_dp_pool_rank(
|
||||
parsed, text, dossier,
|
||||
pool_candidates=pool,
|
||||
synthese=synthese,
|
||||
fallback_oneshot=True,
|
||||
dp_candidates=dp_shortlist,
|
||||
edsnlp_result=edsnlp_result,
|
||||
)
|
||||
|
||||
dossier.dp_selection = selection
|
||||
if selection.candidates:
|
||||
winner = selection.candidates[0]
|
||||
dossier.diagnostic_principal = Diagnostic(
|
||||
texte=winner.label,
|
||||
cim10_suggestion=winner.code,
|
||||
source=winner.source_section,
|
||||
source_page=winner.source_page,
|
||||
source_excerpt=winner.source_excerpt,
|
||||
)
|
||||
|
||||
result = {
|
||||
"dp_code": None,
|
||||
"dp_label": "",
|
||||
"dp_source": "",
|
||||
"verdict": None,
|
||||
"winner_reason": None,
|
||||
"candidates": [],
|
||||
"pool_size": len(pool),
|
||||
"pool_top10": [
|
||||
{"terme": c.terme, "section": c.section,
|
||||
"preuve": c.preuve[:120], "score": round(c.score_initial, 2)}
|
||||
for c in pool[:10]
|
||||
],
|
||||
"synthese": synthese.model_dump() if synthese else None,
|
||||
}
|
||||
if dossier.diagnostic_principal:
|
||||
result["dp_code"] = dossier.diagnostic_principal.cim10_suggestion
|
||||
result["dp_label"] = dossier.diagnostic_principal.texte
|
||||
result["dp_source"] = dossier.diagnostic_principal.source or ""
|
||||
if dossier.dp_selection:
|
||||
sel = dossier.dp_selection
|
||||
result["verdict"] = sel.verdict
|
||||
result["winner_reason"] = sel.winner_reason
|
||||
result["candidates"] = [
|
||||
{"code": c.code, "label": c.label, "section": c.source_section,
|
||||
"score": c.score, "details": c.score_details}
|
||||
for c in sel.candidates
|
||||
]
|
||||
return result
|
||||
|
||||
|
||||
# --- Matching helpers ---
|
||||
|
||||
def _norm(code: str) -> str:
|
||||
"""Normalise un code CIM-10 pour comparaison : supprime le point."""
|
||||
return code.replace(".", "")
|
||||
|
||||
|
||||
def match_exact(a: str | None, b: str | None) -> bool:
|
||||
if not a or not b:
|
||||
return False
|
||||
return a == b
|
||||
|
||||
|
||||
def match_family4(a: str | None, b: str | None) -> bool:
|
||||
"""4 premiers chars sans point identiques (ex: K851 vs K850 → True)."""
|
||||
if not a or not b:
|
||||
return False
|
||||
return _norm(a)[:4] == _norm(b)[:4]
|
||||
|
||||
|
||||
def match_family3(a: str | None, b: str | None) -> bool:
|
||||
"""3 premiers chars identiques (ex: K85.x → K85)."""
|
||||
if not a or not b:
|
||||
return False
|
||||
return a[:3] == b[:3]
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Benchmark qualité DP scoring")
|
||||
parser.add_argument("--limit", type=int, default=50, help="Nombre de dossiers")
|
||||
parser.add_argument("--verbose", action="store_true", help="Afficher chaque dossier")
|
||||
parser.add_argument("--use-llm", action="store_true",
|
||||
help="Activer le fallback LLM DP sur les REVIEW (nécessite Ollama)")
|
||||
parser.add_argument("--synthese-pmsi", action="store_true",
|
||||
help="Générer la SynthesePMSI pour chaque dossier (nécessite Ollama)")
|
||||
parser.add_argument("--dp-candidates", action="store_true",
|
||||
help="Générer et afficher le DP Candidate Pool pour chaque dossier")
|
||||
parser.add_argument("--use-dp-pool-rank", action="store_true",
|
||||
help="Utiliser le mode DP Pool Rank (LLM choisit parmi le pool, nécessite Ollama)")
|
||||
args = parser.parse_args()
|
||||
|
||||
mode = "déterministe + LLM fallback" if args.use_llm else "déterministe seul"
|
||||
if args.use_dp_pool_rank:
|
||||
mode = "DP Pool Rank (LLM choisit parmi pool)"
|
||||
if args.synthese_pmsi:
|
||||
mode += " + SynthesePMSI"
|
||||
if args.dp_candidates:
|
||||
mode += " + DPCandidatePool"
|
||||
print(f"=== Benchmark DP scoring {mode} (n={args.limit}) ===\n")
|
||||
|
||||
dossiers = find_crh_dossiers(limit=args.limit)
|
||||
print(f"Dossiers CRH trouvés : {len(dossiers)}\n")
|
||||
if not dossiers:
|
||||
print("ERREUR : aucun dossier CRH avec gold JSON trouvé")
|
||||
return
|
||||
|
||||
# Compteurs
|
||||
total = len(dossiers)
|
||||
gold_none = 0 # gold_dp = None (exclus des métriques de match)
|
||||
evaluable = 0 # gold_dp non-None → base pour exact/family
|
||||
exact = 0
|
||||
fam4 = 0
|
||||
fam3 = 0
|
||||
coverage_has_dp = 0 # new_code non-None (sur total)
|
||||
review_count = 0
|
||||
confirmed_count = 0
|
||||
comorbidity_fallback_count = 0
|
||||
errors: list[dict] = []
|
||||
review_reasons: list[str] = []
|
||||
syntheses: list[dict] = [] # (crh_name, gold_code, new_code, synthese_dict)
|
||||
dp_pools: list[dict] = []
|
||||
pool_rank_results: list[dict] = [] # résultats détaillés pool rank
|
||||
gold_confidences = Counter()
|
||||
verdicts = Counter()
|
||||
source_sections = Counter()
|
||||
timings: list[float] = []
|
||||
|
||||
for i, d in enumerate(dossiers):
|
||||
gold = load_gold_dp(d["gold_path"])
|
||||
gold_code = gold["code"]
|
||||
gold_confidences[gold["confidence"] or "none"] += 1
|
||||
|
||||
t0 = time.time()
|
||||
if args.use_dp_pool_rank:
|
||||
result = run_dp_pool_rank(d["text_path"])
|
||||
pool_rank_results.append({
|
||||
"crh": d["crh_name"],
|
||||
"gold_code": gold_code,
|
||||
"dp_label": result["dp_label"],
|
||||
"dp_source": result["dp_source"],
|
||||
"verdict": result["verdict"],
|
||||
"winner_reason": result["winner_reason"],
|
||||
"pool_size": result.get("pool_size", 0),
|
||||
"pool_top10": result.get("pool_top10", []),
|
||||
"synthese": result.get("synthese"),
|
||||
"candidates": result.get("candidates", []),
|
||||
})
|
||||
else:
|
||||
result = run_dp_only(d["text_path"], use_llm=args.use_llm)
|
||||
elapsed = time.time() - t0
|
||||
timings.append(elapsed)
|
||||
|
||||
new_code = result["dp_code"]
|
||||
|
||||
# SynthesePMSI optionnelle
|
||||
if args.synthese_pmsi:
|
||||
text_synth = d["text_path"].read_text(encoding="utf-8")
|
||||
parsed_synth = parse_crh(text_synth)
|
||||
dossier_tmp = DossierMedical()
|
||||
dossier_tmp.document_type = parsed_synth.get("type", "")
|
||||
_extract_sejour(parsed_synth, dossier_tmp)
|
||||
_extract_actes(text_synth, dossier_tmp)
|
||||
synthese = generate_synthese_pmsi(parsed_synth, text_synth, dossier_tmp)
|
||||
syntheses.append({
|
||||
"crh": d["crh_name"],
|
||||
"gold_code": gold_code,
|
||||
"new_code": new_code,
|
||||
"synthese": synthese.model_dump() if synthese else None,
|
||||
})
|
||||
# DP Candidate Pool optionnel
|
||||
if args.dp_candidates:
|
||||
text_pool = d["text_path"].read_text(encoding="utf-8")
|
||||
parsed_pool = parse_crh(text_pool)
|
||||
dossier_pool = DossierMedical()
|
||||
dossier_pool.document_type = parsed_pool.get("type", "")
|
||||
_extract_sejour(parsed_pool, dossier_pool)
|
||||
_extract_actes(text_pool, dossier_pool)
|
||||
edsnlp_pool = None
|
||||
try:
|
||||
from src.medical.edsnlp_pipeline import run_edsnlp
|
||||
edsnlp_pool = run_edsnlp(text_pool)
|
||||
except Exception:
|
||||
pass
|
||||
pool = build_dp_candidate_pool(parsed_pool, text_pool, edsnlp_pool, dossier_pool)
|
||||
dp_pools.append({
|
||||
"crh": d["crh_name"],
|
||||
"gold_code": gold_code,
|
||||
"new_code": new_code,
|
||||
"pool_size": len(pool),
|
||||
"candidates": [
|
||||
{"terme": c.terme, "section": c.section,
|
||||
"preuve": c.preuve[:120], "score": round(c.score_initial, 2)}
|
||||
for c in pool
|
||||
],
|
||||
})
|
||||
|
||||
verdict = result["verdict"]
|
||||
verdicts[verdict or "no_selection"] += 1
|
||||
|
||||
if result["dp_source"]:
|
||||
source_sections[result["dp_source"]] += 1
|
||||
|
||||
# Coverage : new_code proposé (sur total)
|
||||
if new_code:
|
||||
coverage_has_dp += 1
|
||||
|
||||
# Métriques de match : uniquement si gold_dp non-None
|
||||
if gold_code is None:
|
||||
gold_none += 1
|
||||
else:
|
||||
evaluable += 1
|
||||
is_exact = match_exact(new_code, gold_code)
|
||||
is_f4 = match_family4(new_code, gold_code)
|
||||
is_f3 = match_family3(new_code, gold_code)
|
||||
if is_exact:
|
||||
exact += 1
|
||||
if is_f4:
|
||||
fam4 += 1
|
||||
if is_f3:
|
||||
fam3 += 1
|
||||
|
||||
# Erreurs (non-exact avec gold)
|
||||
if not is_exact:
|
||||
errors.append({
|
||||
"dir": d["dir_name"],
|
||||
"crh": d["crh_name"],
|
||||
"gold_code": gold_code,
|
||||
"gold_label": gold["label"],
|
||||
"gold_conf": gold["confidence"],
|
||||
"new_code": new_code or "(aucun)",
|
||||
"new_label": result["dp_label"] or "(aucun)",
|
||||
"new_source": result["dp_source"],
|
||||
"verdict": verdict,
|
||||
"winner_reason": result["winner_reason"] or "",
|
||||
"candidates": result["candidates"][:3],
|
||||
"is_f4": is_f4,
|
||||
"is_f3": is_f3,
|
||||
})
|
||||
|
||||
if result.get("comorbidity_fallback"):
|
||||
comorbidity_fallback_count += 1
|
||||
|
||||
if verdict == "review":
|
||||
review_count += 1
|
||||
if result["winner_reason"]:
|
||||
review_reasons.append(result["winner_reason"])
|
||||
elif verdict == "confirmed":
|
||||
confirmed_count += 1
|
||||
|
||||
if args.verbose:
|
||||
if gold_code is None:
|
||||
tag = "SKIP"
|
||||
elif match_exact(new_code, gold_code):
|
||||
tag = "EXACT"
|
||||
elif match_family4(new_code, gold_code):
|
||||
tag = "FAM4"
|
||||
elif match_family3(new_code, gold_code):
|
||||
tag = "FAM3"
|
||||
else:
|
||||
tag = "MISS"
|
||||
print(f" [{i+1:3d}] {d['crh_name']} : gold={gold_code} new={new_code} "
|
||||
f"[{tag}] verdict={verdict} ({elapsed:.1f}s)")
|
||||
|
||||
# === Rapport ===
|
||||
print(f"\n{'='*60}")
|
||||
print(f"RESULTATS — {total} dossiers CRH ({mode})")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
pct = lambda n, d: n / d * 100 if d else 0
|
||||
|
||||
print(f" Évaluables (gold non-None) : {evaluable}/{total} (excl. {gold_none} sans gold DP)")
|
||||
print()
|
||||
print(f" DP exact match : {exact}/{evaluable} ({pct(exact, evaluable):.1f}%)")
|
||||
print(f" DP family4 : {fam4}/{evaluable} ({pct(fam4, evaluable):.1f}%)")
|
||||
print(f" DP family3 : {fam3}/{evaluable} ({pct(fam3, evaluable):.1f}%)")
|
||||
print(f" Coverage DP : {coverage_has_dp}/{total} ({pct(coverage_has_dp, total):.1f}%)")
|
||||
print()
|
||||
print(f" Verdict REVIEW : {review_count}/{total} ({pct(review_count, total):.1f}%)")
|
||||
print(f" Verdict CONFIRM: {confirmed_count}/{total} ({pct(confirmed_count, total):.1f}%)")
|
||||
print(f" Comorbidité FB : {comorbidity_fallback_count}/{total} ({pct(comorbidity_fallback_count, total):.1f}%)")
|
||||
|
||||
if timings:
|
||||
avg_t = sum(timings) / len(timings)
|
||||
print(f"\n Temps moyen : {avg_t:.1f}s/dossier")
|
||||
print(f" Temps total : {sum(timings):.1f}s")
|
||||
|
||||
print(f"\n Gold confidence :")
|
||||
for conf, cnt in gold_confidences.most_common():
|
||||
print(f" {conf:8s} : {cnt}")
|
||||
|
||||
print(f"\n Sources DP (new) :")
|
||||
for src, cnt in source_sections.most_common():
|
||||
print(f" {src:35s} : {cnt}")
|
||||
|
||||
print(f"\n Verdicts :")
|
||||
for v, cnt in verdicts.most_common():
|
||||
print(f" {v:15s} : {cnt}")
|
||||
|
||||
if review_reasons:
|
||||
print(f"\n Top 5 review reasons :")
|
||||
reason_patterns = Counter()
|
||||
for r in review_reasons:
|
||||
if "aucun candidat" in r:
|
||||
reason_patterns["aucun candidat DP trouvé"] += 1
|
||||
elif "delta insuffisant" in r:
|
||||
reason_patterns["delta insuffisant (ambiguïté)"] += 1
|
||||
elif "evidence_excerpt vide" in r:
|
||||
reason_patterns["LLM: evidence_excerpt vide"] += 1
|
||||
elif "comorbidité" in r:
|
||||
reason_patterns["LLM: comorbidité hors section forte"] += 1
|
||||
elif "code invalide" in r:
|
||||
reason_patterns["LLM: code CIM-10 invalide"] += 1
|
||||
elif "LLM non disponible" in r or "erreur LLM" in r:
|
||||
reason_patterns["LLM: erreur/indisponible"] += 1
|
||||
elif "réponse LLM invalide" in r:
|
||||
reason_patterns["LLM: réponse invalide"] += 1
|
||||
elif "section faible" in r or "confidence" in r:
|
||||
reason_patterns["LLM: garde-fou (section/confidence)"] += 1
|
||||
else:
|
||||
reason_patterns[r[:60]] += 1
|
||||
for reason, cnt in reason_patterns.most_common(5):
|
||||
print(f" [{cnt:2d}] {reason}")
|
||||
|
||||
if errors:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"ERREURS DP — {len(errors)} dossiers (5 premiers)")
|
||||
print(f"{'='*60}\n")
|
||||
for e in errors[:5]:
|
||||
fam_tag = " [fam4]" if e.get("is_f4") else (" [fam3]" if e.get("is_f3") else "")
|
||||
print(f" {e['crh']} ({e['dir']}){fam_tag}")
|
||||
print(f" Gold : {e['gold_code']} — {e['gold_label'][:60]} (conf={e['gold_conf']})")
|
||||
print(f" New : {e['new_code']} — {e['new_label'][:60]}")
|
||||
print(f" Source: {e['new_source']}, Verdict: {e['verdict']}")
|
||||
if e.get('winner_reason'):
|
||||
print(f" Reason: {e['winner_reason'][:80]}")
|
||||
if e['candidates']:
|
||||
print(f" Candidats :")
|
||||
for c in e['candidates']:
|
||||
print(f" {c['code']} — {c['label'][:50]} "
|
||||
f"(section={c['section']}, score={c['score']})")
|
||||
print()
|
||||
|
||||
# Affichage des synthèses PMSI si activé
|
||||
if args.synthese_pmsi and syntheses:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"SYNTHESES PMSI — {len(syntheses)} dossiers")
|
||||
print(f"{'='*60}")
|
||||
for s in syntheses:
|
||||
print(f"\n --- {s['crh']} (gold={s['gold_code']}, new={s['new_code']}) ---")
|
||||
syn = s.get("synthese")
|
||||
if not syn:
|
||||
print(" (échec génération)")
|
||||
continue
|
||||
print(f" Motif admission : {syn.get('motif_admission', '')[:100]}")
|
||||
print(f" Problème PEC : {syn.get('probleme_pris_en_charge', '')[:100]}")
|
||||
print(f" Diagnostic retenu : {syn.get('diagnostic_retenu', '')[:100]}")
|
||||
actes = syn.get("actes_ou_traitements_majeurs", [])
|
||||
if actes:
|
||||
print(f" Actes/traitements : {', '.join(a[:60] for a in actes[:4])}")
|
||||
compli = syn.get("complications", [])
|
||||
if compli:
|
||||
print(f" Complications : {', '.join(c[:60] for c in compli[:3])}")
|
||||
comor = syn.get("terrain_comorbidites", [])
|
||||
if comor:
|
||||
print(f" Terrain/comorbidités: {', '.join(c[:60] for c in comor[:5])}")
|
||||
preuves = syn.get("preuves", [])
|
||||
if preuves:
|
||||
print(f" Preuves ({len(preuves)}) :")
|
||||
for p in preuves[:3]:
|
||||
print(f" [{p.get('section', '?')}] {p.get('excerpt', '')[:120]}")
|
||||
|
||||
# Affichage des résultats DP Pool Rank si activé
|
||||
if args.use_dp_pool_rank and pool_rank_results:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"DP POOL RANK — {len(pool_rank_results)} dossiers")
|
||||
print(f"{'='*60}")
|
||||
chosen_ok = sum(1 for r in pool_rank_results if r["dp_label"])
|
||||
print(f"\n Choix effectué : {chosen_ok}/{len(pool_rank_results)} "
|
||||
f"({chosen_ok/len(pool_rank_results)*100:.0f}%)")
|
||||
for r in pool_rank_results:
|
||||
print(f"\n --- {r['crh']} (gold={r['gold_code']}) ---")
|
||||
# SynthesePMSI
|
||||
syn = r.get("synthese")
|
||||
if syn:
|
||||
print(f" SynthesePMSI :")
|
||||
print(f" Motif admission : {syn.get('motif_admission', '')[:80]}")
|
||||
print(f" Problème PEC : {syn.get('probleme_pris_en_charge', '')[:80]}")
|
||||
print(f" Diag retenu : {syn.get('diagnostic_retenu', '')[:80]}")
|
||||
else:
|
||||
print(f" SynthesePMSI : (non disponible)")
|
||||
# Pool top 10
|
||||
print(f" Pool ({r['pool_size']} candidats) :")
|
||||
for j, c in enumerate(r.get("pool_top10", [])[:10]):
|
||||
print(f" [{j}] {c['terme'][:55]:55s} ({c['section']}, {c['score']:.2f})")
|
||||
# Résultat LLM
|
||||
print(f" >>> DP choisi : {r['dp_label'][:70] or '(aucun)'}")
|
||||
print(f" Source : {r['dp_source']}")
|
||||
print(f" Verdict : {r['verdict']}")
|
||||
print(f" Reason : {(r['winner_reason'] or '')[:100]}")
|
||||
|
||||
# Affichage des DP Candidate Pools si activé
|
||||
if args.dp_candidates and dp_pools:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"DP CANDIDATE POOL — {len(dp_pools)} dossiers")
|
||||
print(f"{'='*60}")
|
||||
pool_sizes = [p["pool_size"] for p in dp_pools]
|
||||
print(f"\n Taille pool : min={min(pool_sizes)}, max={max(pool_sizes)}, "
|
||||
f"moy={sum(pool_sizes)/len(pool_sizes):.1f}")
|
||||
for p in dp_pools:
|
||||
print(f"\n --- {p['crh']} (gold={p['gold_code']}, new={p['new_code']}) "
|
||||
f"— {p['pool_size']} candidats ---")
|
||||
for i, c in enumerate(p["candidates"][:10], 1):
|
||||
print(f" [{i:2d}] {c['terme'][:60]:60s} "
|
||||
f"({c['section']}, score={c['score']:.2f})")
|
||||
if c["preuve"]:
|
||||
print(f" preuve: {c['preuve'][:100]}")
|
||||
|
||||
# JSON exportable
|
||||
summary = {
|
||||
"mode": mode,
|
||||
"total": total,
|
||||
"evaluable": evaluable,
|
||||
"gold_none": gold_none,
|
||||
"exact_match": exact,
|
||||
"exact_match_pct": round(pct(exact, evaluable), 1),
|
||||
"family4": fam4,
|
||||
"family4_pct": round(pct(fam4, evaluable), 1),
|
||||
"family3": fam3,
|
||||
"family3_pct": round(pct(fam3, evaluable), 1),
|
||||
"coverage_dp": coverage_has_dp,
|
||||
"coverage_dp_pct": round(pct(coverage_has_dp, total), 1),
|
||||
"review_count": review_count,
|
||||
"review_pct": round(pct(review_count, total), 1),
|
||||
"confirmed_count": confirmed_count,
|
||||
"comorbidity_fallback_count": comorbidity_fallback_count,
|
||||
"comorbidity_fallback_pct": round(pct(comorbidity_fallback_count, total), 1),
|
||||
"errors": errors,
|
||||
}
|
||||
if args.synthese_pmsi:
|
||||
summary["syntheses_pmsi"] = syntheses
|
||||
if args.dp_candidates:
|
||||
summary["dp_pools"] = dp_pools
|
||||
if args.use_dp_pool_rank:
|
||||
summary["pool_rank_results"] = pool_rank_results
|
||||
suffix = "_llm" if args.use_llm else ""
|
||||
if args.use_dp_pool_rank:
|
||||
suffix = "_pool_rank"
|
||||
if args.synthese_pmsi:
|
||||
suffix += "_synthese"
|
||||
if args.dp_candidates:
|
||||
suffix += "_pool"
|
||||
out_path = BASE / "output" / f"benchmark_dp_quality{suffix}.json"
|
||||
out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
print(f"\nRésultats exportés : {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user