chore: add .gitignore
This commit is contained in:
0
src/eval/__init__.py
Normal file
0
src/eval/__init__.py
Normal file
548
src/eval/gold_debug.py
Normal file
548
src/eval/gold_debug.py
Normal file
@@ -0,0 +1,548 @@
|
||||
"""Gold debug — génération de rapports détaillés pour l'évaluation NUKE-3.
|
||||
|
||||
Fonctions pures : reçoivent des dicts, produisent des fichiers/strings.
|
||||
Pas de dépendance Ollama ni de mock.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B1 — Case report (JSON + Markdown)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_case_report(
    case_id: str,
    data: dict,
    dp_selection_raw: dict | None,
    gold_case: dict | None,
    eval_result: dict | None,
) -> dict:
    """Build the structured JSON report for a single case.

    Args:
        case_id: case/folder identifier.
        data: full pipeline JSON for the document.
        dp_selection_raw: dp_selection dict, or None.
        gold_case: GoldCRHCase.model_dump(), or None.
        eval_result: evaluate_dp() result dict, or None.

    Returns:
        A JSON-serializable dict with prediction, pool stats,
        top candidates, gold annotation and match evaluation.
    """
    dp_sel = dp_selection_raw or {}
    candidates = dp_sel.get("candidates", [])
    evidence = dp_sel.get("evidence", [])
    # NOTE: the original also read dp_sel["debug_scores"] into a local
    # that was never used; that dead read has been removed.

    # Prediction summary: what NUKE-3 chose and why.
    prediction = {
        "chosen_code": dp_sel.get("chosen_code"),
        "chosen_term": dp_sel.get("chosen_term"),
        "verdict": dp_sel.get("verdict"),
        "confidence": dp_sel.get("confidence"),
        "reason": dp_sel.get("reason"),
        "review_reason_tag": _classify_review_reason(dp_sel.get("reason")),
        "evidence": evidence,
        "evidence_count": len(evidence),
    }

    # Pool statistics: raw pool = associated diagnoses + optional main one.
    pool_stats = {
        "raw_pool_size": len(data.get("diagnostics_associes", [])) + (1 if data.get("diagnostic_principal") else 0),
        "filtered_pool_size": len(candidates),
        "topk_size": len(candidates),
    }

    # Top candidates (max 10), stripped of full CRH text.
    top_candidates = []
    for i, c in enumerate(candidates[:10]):
        top_candidates.append({
            "rank": i + 1,
            "index": c.get("index", i),
            "code": c.get("code"),
            "term": _truncate(c.get("term", ""), 120),
            "score": c.get("score", 0),
            "flags": {
                "is_symptom_like": c.get("is_symptom_like", False),
                "is_comorbidity_like": c.get("is_comorbidity_like", False),
                "is_act_only": c.get("is_act_only", False),
            },
            "section_strength": c.get("section_strength", 0),
            "score_details": c.get("score_details", {}),
            "evidence_snippets": [],  # deliberately empty: no full CRH text
        })

    # Gold annotation section (when an annotation exists for this case).
    gold_section = None
    if gold_case:
        gc = gold_case
        dp_exp = gc.get("dp_expected", {})
        gold_section = {
            "dp_expected": dp_exp,
            "dp_acceptable_codes": gc.get("dp_acceptable_codes", []),
            "dp_acceptable_family3": gc.get("dp_acceptable_family3", []),
            "allow_symptom_dp": gc.get("allow_symptom_dp", False),
            "confidence": gc.get("confidence", "probable"),
        }

    # Match evaluation section (when evaluate_dp() ran for this case).
    match_eval = None
    if eval_result:
        match_eval = {
            "strict_match": eval_result.get("exact_match_strict", False),
            "acceptable_match": eval_result.get("acceptable_match", False),
            "family3_match": eval_result.get("family3_match_tolerant", False),
            "symptom_not_allowed": eval_result.get("symptom_not_allowed", False),
        }

    return {
        "case_id": case_id,
        "document_type": data.get("document_type", "?"),
        "gold": gold_section,
        "prediction": prediction,
        "pool_stats": pool_stats,
        "top_candidates": top_candidates,
        "match_eval": match_eval,
    }
|
||||
|
||||
|
||||
def render_case_markdown(report: dict) -> str:
    """Render the Markdown debug page for a single case report."""
    case_id = report["case_id"]
    pr = report["prediction"]
    gld = report.get("gold")
    mt = report.get("match_eval")
    stats = report["pool_stats"]

    md: list[str] = [f"# Case Debug — {case_id}", ""]

    # Header summary block.
    md.append(f"**Type** : {report['document_type']} ")
    md.append(f"**Verdict** : {pr['verdict']} ")
    md.append(f"**Confidence** : {pr['confidence']} ")
    md.append(f"**Code choisi** : {pr['chosen_code'] or '-'} ")
    md.append(f"**Reason** : {pr['reason'] or '-'} ")
    md.append(f"**Evidence** : {pr['evidence_count']} extrait(s) ")
    md.append(f"**Pool** : {stats['raw_pool_size']} raw → {stats['filtered_pool_size']} candidats ")
    if gld:
        md.append(f"**DP attendu** : {gld['dp_expected'].get('code', '?')} ({gld['dp_expected'].get('label', '?')}) ")
        md.append(f"**Confiance gold** : {gld['confidence']} ")
    if mt:
        strict = "OK" if mt["strict_match"] else "FAIL"
        accept = "OK" if mt["acceptable_match"] else "FAIL"
        sym = "OUI" if mt["symptom_not_allowed"] else "-"
        md.append(f"**Match** : strict={strict}, acceptable={accept}, symptôme interdit={sym} ")
    md.append("")

    # Gold vs prediction comparison table.
    if gld:
        md.extend([
            "## Gold vs Prediction",
            "",
            "| | Gold | NUKE-3 |",
            "|---|------|--------|",
            f"| Code | {gld['dp_expected'].get('code', '-')} | {pr['chosen_code'] or '-'} |",
            f"| Label | {gld['dp_expected'].get('label', '-')} | {_truncate(pr['chosen_term'] or '-', 60)} |",
            f"| Codes acceptables | {', '.join(gld.get('dp_acceptable_codes', []) or ['-'])} | - |",
            f"| Family3 | {', '.join(gld.get('dp_acceptable_family3', []) or ['-'])} | - |",
            f"| Confiance | {gld['confidence']} | {pr['confidence'] or '-'} |",
            f"| Symptôme autorisé | {'oui' if gld.get('allow_symptom_dp') else 'non'} | - |",
            "",
        ])

    # Candidate table.
    md.extend([
        "## Top candidats",
        "",
        "| Rank | Code | Score | Term | Flags | Section |",
        "|------|------|-------|------|-------|---------|",
    ])
    for cand in report["top_candidates"]:
        flags = cand["flags"]
        labels = []
        if flags["is_symptom_like"]:
            labels.append("R*")
        if flags["is_comorbidity_like"]:
            labels.append("comorb")
        if flags["is_act_only"]:
            labels.append("acte")
        md.append(
            f"| {cand['rank']} "
            f"| {cand['code'] or '-'} "
            f"| {cand['score']:.1f} "
            f"| {_truncate(cand['term'], 40)} "
            f"| {', '.join(labels) or '-'} "
            f"| {cand['section_strength']} |"
        )
    md.append("")

    # Evidence excerpts (first 3 at most).
    if pr["evidence"]:
        md.extend(["## Evidence", ""])
        for idx, ev in enumerate(pr["evidence"][:3], 1):
            md.append(f"{idx}. {_truncate(str(ev), 200)}")
        md.append("")

    # Automatic bug hypothesis.
    md.extend(["## Hypothèse bug", "", _diagnose_bug(report), ""])

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    md.append(f"---\n*Généré le {stamp}*")

    return "\n".join(md)
|
||||
|
||||
|
||||
def write_case_report(report: dict, out_dir: Path) -> tuple[Path, Path]:
    """Write case_<id>.json and case_<id>.md; return both paths."""
    out_dir.mkdir(parents=True, exist_ok=True)
    case_id = report["case_id"]

    json_file = out_dir / f"case_{case_id}.json"
    md_file = out_dir / f"case_{case_id}.md"

    json_file.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    md_file.write_text(render_case_markdown(report), encoding="utf-8")

    return json_file, md_file
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B2 — Top-N erreurs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Colonnes CSV top-errors
|
||||
# Column order of the top-errors CSV export.
TOP_ERRORS_CSV_COLS = [
    # Identification
    "case_id", "document_type",
    # Prediction
    "chosen_code", "chosen_term", "verdict", "confidence",
    # Gold annotation
    "expected_code", "acceptable_codes", "acceptable_family3",
    # Match flags
    "strict_match", "acceptable_match", "family3_match", "symptom_not_allowed",
    # Pool statistics
    "raw_pool_size", "filtered_pool_size", "topk_size",
    "evidence_count", "review_reason_tag",
    # Score analysis
    "top1_score", "top2_score", "delta_top1_top2",
    "top3_codes", "top3_terms",
]
|
||||
|
||||
|
||||
def build_error_entry(report: dict) -> dict:
    """Flatten a case report into one top-errors row (CSV/JSONL friendly)."""
    pred = report["prediction"]
    gold = report.get("gold") or {}
    match = report.get("match_eval") or {}
    stats = report["pool_stats"]
    cands = report["top_candidates"]

    # Score gap between the two best candidates (0 when fewer than 2).
    top1 = cands[0]["score"] if cands else 0
    top2 = cands[1]["score"] if len(cands) >= 2 else 0

    best3 = cands[:3]
    expected = gold.get("dp_expected", {})

    return {
        "case_id": report["case_id"],
        "document_type": report["document_type"],
        "chosen_code": pred["chosen_code"] or "",
        "chosen_term": _truncate(pred["chosen_term"] or "", 60),
        "verdict": pred["verdict"] or "",
        "confidence": pred["confidence"] or "",
        "expected_code": expected.get("code", ""),
        "acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
        "acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
        "strict_match": match.get("strict_match", False),
        "acceptable_match": match.get("acceptable_match", False),
        "family3_match": match.get("family3_match", False),
        "symptom_not_allowed": match.get("symptom_not_allowed", False),
        "raw_pool_size": stats["raw_pool_size"],
        "filtered_pool_size": stats["filtered_pool_size"],
        "topk_size": stats["topk_size"],
        "evidence_count": pred["evidence_count"],
        "review_reason_tag": pred["review_reason_tag"],
        "top1_score": top1,
        "top2_score": top2,
        "delta_top1_top2": round(top1 - top2, 1),
        "top3_codes": "|".join(c["code"] or "?" for c in best3),
        "top3_terms": "|".join(_truncate(c["term"], 40) for c in best3),
        # Used for sorting only; stripped before JSONL export.
        "_sort_key": _error_sort_key(match, pred),
    }
|
||||
|
||||
|
||||
def _error_sort_key(match: dict, pred: dict) -> tuple:
|
||||
"""Clé de tri pour le top-errors (plus dangereux en premier).
|
||||
|
||||
Priorité :
|
||||
1. acceptable_match == False (vraies erreurs)
|
||||
2. verdict == CONFIRMED (les plus dangereuses)
|
||||
3. confidence == high (danger max)
|
||||
4. strict fail mais acceptable ok (moins grave)
|
||||
"""
|
||||
acceptable_fail = not match.get("acceptable_match", True)
|
||||
is_confirmed = pred.get("verdict") == "CONFIRMED"
|
||||
is_high = pred.get("confidence") == "high"
|
||||
strict_fail = not match.get("strict_match", True)
|
||||
|
||||
# Tri descendant : True avant False → on retourne des négatifs
|
||||
return (
|
||||
not acceptable_fail, # False first (True erreurs en tête)
|
||||
not is_confirmed, # False first (CONFIRMED en tête)
|
||||
not is_high, # False first (high en tête)
|
||||
not strict_fail, # False first (strict fail en tête)
|
||||
)
|
||||
|
||||
|
||||
def sort_error_entries(entries: list[dict]) -> list[dict]:
    """Return entries ordered by error priority (most dangerous first)."""
    # Entries lacking a precomputed key sort last (least dangerous).
    fallback = (True, True, True, True)
    return sorted(entries, key=lambda entry: entry.get("_sort_key", fallback))
|
||||
|
||||
|
||||
def write_top_errors_csv(entries: list[dict], path: Path) -> None:
    """Write the top-errors CSV (columns from TOP_ERRORS_CSV_COLS)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as fh:
        # extrasaction="ignore" silently drops keys such as "_sort_key".
        writer = csv.DictWriter(fh, fieldnames=TOP_ERRORS_CSV_COLS, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(entries)
|
||||
|
||||
|
||||
def write_top_errors_jsonl(entries: list[dict], path: Path) -> None:
    """Write the top-errors JSONL file (internal "_"-prefixed keys stripped)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        for entry in entries:
            public = {k: v for k, v in entry.items() if not k.startswith("_")}
            fh.write(json.dumps(public, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
def write_top_errors_md(entries: list[dict], path: Path) -> None:
    """Write the top-errors Markdown summary table."""
    path.parent.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    md: list[str] = [
        "# NUKE-3 — Top erreurs gold CRH",
        "",
        f"**Date** : {stamp} ",
        f"**Cas** : {len(entries)} ",
        "",
        "| # | Case ID | Choisi | Attendu | Strict | Accept. | Verdict | Conf. | Delta | Reason |",
        "|---|---------|--------|---------|--------|---------|---------|-------|-------|--------|",
    ]

    for rank, entry in enumerate(entries, 1):
        strict = "OK" if entry["strict_match"] else "FAIL"
        accept = "OK" if entry["acceptable_match"] else "FAIL"
        md.append(
            f"| {rank} "
            f"| {entry['case_id']} "
            f"| {entry['chosen_code'] or '-'} "
            f"| {entry['expected_code']} "
            f"| {strict} "
            f"| {accept} "
            f"| {entry['verdict']} "
            f"| {entry['confidence']} "
            f"| {entry['delta_top1_top2']} "
            f"| {_truncate(entry.get('review_reason_tag', ''), 30)} |"
        )

    md.extend(["", "---", f"*Généré le {stamp}*"])
    path.write_text("\n".join(md), encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B3 — DIM Pack
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def select_dim_pack_cases(
    reports: list[dict],
    n: int,
) -> list[dict]:
    """Pick the N most informative CRH cases for DIM annotation.

    Mix of:
      - errors (acceptable-match failures) first,
      - frequent REVIEW verdicts,
      - symptom allowed/forbidden cases,
      - comorbidity-as-DP cases.
    """

    def _interest(rep: dict) -> float:
        """Heuristic interest score: the higher, the more worth annotating."""
        match = rep.get("match_eval") or {}
        pred = rep["prediction"]
        cands = rep["top_candidates"]

        score = 0.0
        # Acceptable-match failure: most interesting case for review.
        if match and not match.get("acceptable_match", True):
            score += 10
        # REVIEW verdict: worth a look.
        if pred.get("verdict") == "REVIEW":
            score += 5
        # Forbidden symptom chosen as DP.
        if match and match.get("symptom_not_allowed"):
            score += 3
        # Comorbidity-like candidate ranked first.
        if cands and cands[0].get("flags", {}).get("is_comorbidity_like"):
            score += 2
        # Small top1/top2 score gap signals ambiguity.
        if len(cands) >= 2 and cands[0]["score"] - cands[1]["score"] < 2:
            score += 1
        return score

    # Stable sort keeps the original order among equally interesting cases.
    ranked = sorted(reports, key=_interest, reverse=True)
    return ranked[:n]
|
||||
|
||||
|
||||
def write_dim_pack(cases: list[dict], out_dir: Path) -> tuple[Path, Path]:
    """Write the DIM pack: a pre-filled CSV plus one JSON file per case.

    Returns (csv_path, cases_dir).
    """
    date_str = datetime.now().strftime("%Y%m%d")
    csv_path = out_dir / f"DIM_PACK_{date_str}.csv"
    cases_dir = out_dir / f"DIM_PACK_{date_str}_cases"
    cases_dir.mkdir(parents=True, exist_ok=True)

    # Columns of the pre-filled annotation CSV; the DIM fills in "notes".
    dim_cols = [
        "case_id", "document_type",
        "chosen_code", "chosen_term", "verdict", "confidence",
        "dp_expected_code", "dp_expected_label",
        "dp_acceptable_codes", "dp_acceptable_family3",
        "allow_symptom_dp", "confidence_gold",
        "notes",
    ]

    with open(csv_path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=dim_cols)
        writer.writeheader()
        for rep in cases:
            gold = rep.get("gold") or {}
            pred = rep["prediction"]
            dp_exp = gold.get("dp_expected", {})
            writer.writerow({
                "case_id": rep["case_id"],
                "document_type": rep["document_type"],
                "chosen_code": pred["chosen_code"] or "",
                "chosen_term": _truncate(pred["chosen_term"] or "", 60),
                "verdict": pred["verdict"] or "",
                "confidence": pred["confidence"] or "",
                "dp_expected_code": dp_exp.get("code", ""),
                "dp_expected_label": dp_exp.get("label", ""),
                "dp_acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
                "dp_acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
                "allow_symptom_dp": gold.get("allow_symptom_dp", ""),
                "confidence_gold": gold.get("confidence", ""),
                "notes": "",
            })

    # One raw JSON per selected case, for detailed inspection.
    for rep in cases:
        case_file = cases_dir / f"{rep['case_id']}.json"
        case_file.write_text(json.dumps(rep, ensure_ascii=False, indent=2), encoding="utf-8")

    return csv_path, cases_dir
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers internes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _truncate(s: str, maxlen: int) -> str:
|
||||
if len(s) <= maxlen:
|
||||
return s
|
||||
return s[: maxlen - 3] + "..."
|
||||
|
||||
|
||||
def _classify_review_reason(reason: str | None) -> str:
|
||||
"""Classifie la raison de REVIEW en tag court."""
|
||||
if not reason:
|
||||
return "unknown"
|
||||
r = reason.lower()
|
||||
if "aucun candidat" in r:
|
||||
return "no_candidates"
|
||||
if "mono-candidat" in r or "mono_candidat" in r:
|
||||
return "mono_fragile"
|
||||
if "écart" in r or "seuil" in r:
|
||||
return "low_delta"
|
||||
if "preuve" in r or "evidence" in r:
|
||||
return "no_evidence"
|
||||
if "llm" in r:
|
||||
return "llm_unavailable"
|
||||
return "other"
|
||||
|
||||
|
||||
def _diagnose_bug(report: dict) -> str:
|
||||
"""Génère une hypothèse de bug à partir du case report."""
|
||||
pred = report["prediction"]
|
||||
match = report.get("match_eval")
|
||||
gold = report.get("gold")
|
||||
cands = report["top_candidates"]
|
||||
pool = report["pool_stats"]
|
||||
|
||||
hypotheses: list[str] = []
|
||||
|
||||
if not cands:
|
||||
hypotheses.append("**Pool vide** — aucun candidat DP n'a été extrait. "
|
||||
"Vérifier l'extraction CIM-10 sur ce document.")
|
||||
return "\n".join(hypotheses)
|
||||
|
||||
if match and gold:
|
||||
expected_code = gold.get("dp_expected", {}).get("code", "")
|
||||
|
||||
# Le code attendu est-il dans le pool ?
|
||||
pool_codes = {c["code"] for c in cands if c.get("code")}
|
||||
if expected_code and expected_code not in pool_codes:
|
||||
# Vérifier family3
|
||||
exp_fam = expected_code[:3]
|
||||
fam_in_pool = any(c["code"][:3] == exp_fam for c in cands if c.get("code"))
|
||||
if fam_in_pool:
|
||||
hypotheses.append(
|
||||
f"**Code attendu absent mais famille présente** — "
|
||||
f"`{expected_code}` absent du pool, mais famille `{exp_fam}` présente. "
|
||||
f"Problème de spécificité du code extrait."
|
||||
)
|
||||
else:
|
||||
hypotheses.append(
|
||||
f"**Code attendu absent du pool** — `{expected_code}` non extrait. "
|
||||
f"Erreur d'extraction en amont (regex/edsnlp/LLM)."
|
||||
)
|
||||
|
||||
elif expected_code in pool_codes and not match.get("strict_match"):
|
||||
# Code dans le pool mais pas sélectionné
|
||||
winner_code = pred["chosen_code"]
|
||||
expected_rank = next(
|
||||
(i + 1 for i, c in enumerate(cands) if c["code"] == expected_code), None
|
||||
)
|
||||
if expected_rank:
|
||||
hypotheses.append(
|
||||
f"**Mauvais classement** — `{expected_code}` est dans le pool "
|
||||
f"(rank {expected_rank}) mais `{winner_code}` a été choisi. "
|
||||
f"Problème de scoring (bonus/malus)."
|
||||
)
|
||||
|
||||
if match.get("symptom_not_allowed"):
|
||||
hypotheses.append(
|
||||
"**Symptôme en DP non autorisé** — un code R* a été sélectionné "
|
||||
"alors que l'étiologie était disponible."
|
||||
)
|
||||
|
||||
if not hypotheses:
|
||||
if pred["verdict"] == "REVIEW":
|
||||
hypotheses.append(
|
||||
"**Delta trop faible** — les scores des candidats sont trop proches "
|
||||
"pour une décision automatique. Le LLM ranker pourrait aider."
|
||||
)
|
||||
else:
|
||||
hypotheses.append("Aucune anomalie détectée.")
|
||||
|
||||
return "\n".join(hypotheses)
|
||||
211
src/eval/gold_models.py
Normal file
211
src/eval/gold_models.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Gold standard CRH — modèles Pydantic + chargement + évaluation tolérante.
|
||||
|
||||
Schéma tolérant TIM : dp_expected + dp_acceptable_codes + dp_acceptable_family3.
|
||||
Permet d'évaluer NUKE-3 sur des CRH réels avec des marges de tolérance DIM.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Basic CIM-10 code shape: one uppercase letter, two digits, then an
# optional dot followed by one or two digits (e.g. "A12", "A12.3", "A12.34").
_CIM10_RE = re.compile(r"^[A-Z]\d{2}(?:\.\d{1,2})?$")


def is_valid_cim10_format(code: str) -> bool:
    """Return True when *code* has the basic CIM-10 shape (after strip/upper)."""
    normalized = code.strip().upper()
    return _CIM10_RE.match(normalized) is not None
|
||||
|
||||
|
||||
def cim10_family3(code: str) -> str:
    """Return the 3-character family prefix of a CIM-10 code."""
    normalized = code.strip().upper()
    return normalized[:3]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Modèles
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class GoldEvidence(BaseModel):
    """Short excerpt serving as evidence for the expected DP."""
    # section: CRH section the excerpt was taken from
    # excerpt: quoted text itself (capped at 240 chars, see validator)
    section: str
    excerpt: str

    @field_validator("excerpt")
    @classmethod
    def excerpt_max_length(cls, v: str) -> str:
        # Keep evidence excerpts short; the ValueError surfaces as a
        # pydantic validation error (message is a runtime string, kept as-is).
        if len(v) > 240:
            raise ValueError(f"excerpt trop long ({len(v)} > 240 chars)")
        return v
|
||||
|
||||
|
||||
class GoldDPExpected(BaseModel):
    """Expected DP (main diagnosis) as a CIM-10 code plus its label."""
    code: str
    label: str

    @field_validator("code")
    @classmethod
    def code_format(cls, v: str) -> str:
        # Normalize before validating so e.g. " a12.3 " is accepted as "A12.3".
        v = v.strip().upper()
        if not is_valid_cim10_format(v):
            raise ValueError(f"code CIM-10 invalide: {v}")
        return v
|
||||
|
||||
|
||||
class GoldCRHCase(BaseModel):
    """One annotated gold CRH case used to evaluate NUKE-3.

    Tolerant schema: dp_expected plus dp_acceptable_codes plus
    dp_acceptable_family3 give the DIM room for acceptable variants.
    """
    case_id: str
    document_type: str = "crh"
    dp_expected: GoldDPExpected
    dp_acceptable_codes: list[str] = Field(default_factory=list)
    dp_acceptable_family3: list[str] = Field(default_factory=list)
    allow_symptom_dp: bool = False
    confidence: str = "certain"  # certain | probable | ambiguous
    evidence: list[GoldEvidence] = Field(default_factory=list)
    notes: str = ""

    @field_validator("confidence")
    @classmethod
    def confidence_valid(cls, v: str) -> str:
        # Closed vocabulary for annotator confidence.
        allowed = {"certain", "probable", "ambiguous"}
        if v not in allowed:
            raise ValueError(f"confidence doit être parmi {allowed}, reçu: {v}")
        return v

    @field_validator("dp_acceptable_codes")
    @classmethod
    def acceptable_codes_format(cls, v: list[str]) -> list[str]:
        # Normalize to upper-case, drop empty entries, reject malformed codes.
        result = []
        for code in v:
            code = code.strip().upper()
            if code and not is_valid_cim10_format(code):
                raise ValueError(f"code acceptable invalide: {code}")
            if code:
                result.append(code)
        return result

    @field_validator("dp_acceptable_family3")
    @classmethod
    def family3_format(cls, v: list[str]) -> list[str]:
        # Families are exactly one letter + two digits (no dot suffix).
        result = []
        for fam in v:
            fam = fam.strip().upper()
            if fam and not re.match(r"^[A-Z]\d{2}$", fam):
                raise ValueError(f"family3 invalide: {fam} (attendu: lettre+2 chiffres)")
            if fam:
                result.append(fam)
        return result

    @field_validator("notes")
    @classmethod
    def notes_max_length(cls, v: str) -> str:
        # Keep free-text notes short enough for CSV/report exports.
        if len(v) > 400:
            raise ValueError(f"notes trop longues ({len(v)} > 400 chars)")
        return v
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chargement JSONL
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_gold_jsonl(path: Path | str) -> list[GoldCRHCase]:
    """Load a gold JSONL file (one JSON object per line).

    Blank lines and lines starting with '#' are skipped. Parsing and
    validation problems are collected and reported together (first 10).
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Fichier gold introuvable: {path}")

    cases: list[GoldCRHCase] = []
    errors: list[str] = []
    for lineno, raw in enumerate(path.read_text("utf-8").splitlines(), 1):
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        try:
            payload = json.loads(stripped)
            cases.append(GoldCRHCase(**payload))
        except Exception as exc:
            errors.append(f"ligne {lineno}: {exc}")

    if errors:
        raise ValueError(f"{len(errors)} erreur(s) dans {path.name}:\n" + "\n".join(errors[:10]))

    return cases
|
||||
|
||||
|
||||
def load_gold_index(path: Path | str) -> dict[str, GoldCRHCase]:
    """Load the gold file and index cases by case_id (duplicates rejected)."""
    index: dict[str, GoldCRHCase] = {}
    for case in load_gold_jsonl(path):
        if case.case_id in index:
            raise ValueError(f"case_id dupliqué: {case.case_id}")
        index[case.case_id] = case
    return index
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Évaluation tolérante
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def evaluate_dp(chosen_code: str | None, gold: GoldCRHCase) -> dict:
    """Evaluate a chosen DP code against a gold case, with tolerance.

    Returned metrics:
      - exact_match_strict: chosen code == dp_expected.code
      - exact_match_tolerant_codes: chosen code in dp_acceptable_codes
      - family3_match_tolerant: family3(chosen) in dp_acceptable_family3
      - acceptable_match: tolerant_codes OR family3_match
      - symptom_not_allowed: R* chosen while allow_symptom_dp is false
    """
    result = {
        "case_id": gold.case_id,
        "dp_expected_code": gold.dp_expected.code,
        "dp_expected_label": gold.dp_expected.label,
        "chosen_code": chosen_code,
        "confidence_gold": gold.confidence,
        "allow_symptom_dp": gold.allow_symptom_dp,
        "exact_match_strict": False,
        "exact_match_tolerant_codes": False,
        "family3_match_tolerant": False,
        "acceptable_match": False,
        "symptom_not_allowed": False,
    }

    # No prediction at all: every match flag stays False.
    if not chosen_code:
        return result

    chosen_up = chosen_code.strip().upper()
    expected_up = gold.dp_expected.code.upper()

    # 1) Strict match on the exact code.
    result["exact_match_strict"] = chosen_up == expected_up

    # 2) The tolerant code set always includes the expected code itself.
    acceptable = {expected_up} | {c.upper() for c in gold.dp_acceptable_codes}
    result["exact_match_tolerant_codes"] = chosen_up in acceptable

    # 3) Family (3-char prefix) tolerance; expected family always included.
    families = {expected_up[:3]} | {f.upper() for f in gold.dp_acceptable_family3}
    result["family3_match_tolerant"] = cim10_family3(chosen_up) in families

    # 4) Acceptable when either tolerance holds.
    result["acceptable_match"] = (
        result["exact_match_tolerant_codes"] or result["family3_match_tolerant"]
    )

    # 5) Symptom (R*) chosen as DP while the gold annotation forbids it.
    result["symptom_not_allowed"] = chosen_up.startswith("R") and not gold.allow_symptom_dp

    return result
|
||||
Reference in New Issue
Block a user