chore: add .gitignore
This commit is contained in:
0
src/eval/__init__.py
Normal file
0
src/eval/__init__.py
Normal file
548
src/eval/gold_debug.py
Normal file
548
src/eval/gold_debug.py
Normal file
@@ -0,0 +1,548 @@
|
||||
"""Gold debug — génération de rapports détaillés pour l'évaluation NUKE-3.
|
||||
|
||||
Fonctions pures : reçoivent des dicts, produisent des fichiers/strings.
|
||||
Pas de dépendance Ollama ni de mock.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B1 — Case report (JSON + Markdown)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_case_report(
    case_id: str,
    data: dict,
    dp_selection_raw: dict | None,
    gold_case: dict | None,
    eval_result: dict | None,
) -> dict:
    """Build the structured JSON report for a single case.

    Args:
        case_id: case/folder identifier.
        data: full pipeline JSON for the document.
        dp_selection_raw: dp_selection dict, or None.
        gold_case: GoldCRHCase.model_dump(), or None.
        eval_result: evaluate_dp() result dict, or None.

    Returns:
        A JSON-serializable dict with prediction, pool stats,
        top candidates, gold annotation and match evaluation.
    """
    dp_sel = dp_selection_raw or {}
    candidates = dp_sel.get("candidates", [])
    evidence = dp_sel.get("evidence", [])
    # NOTE: the original also read dp_sel["debug_scores"] into a local
    # that was never used; that dead read has been removed.

    # Prediction summary: what NUKE-3 chose and why.
    prediction = {
        "chosen_code": dp_sel.get("chosen_code"),
        "chosen_term": dp_sel.get("chosen_term"),
        "verdict": dp_sel.get("verdict"),
        "confidence": dp_sel.get("confidence"),
        "reason": dp_sel.get("reason"),
        "review_reason_tag": _classify_review_reason(dp_sel.get("reason")),
        "evidence": evidence,
        "evidence_count": len(evidence),
    }

    # Pool statistics: raw pool = associated diagnoses + optional main one.
    pool_stats = {
        "raw_pool_size": len(data.get("diagnostics_associes", [])) + (1 if data.get("diagnostic_principal") else 0),
        "filtered_pool_size": len(candidates),
        "topk_size": len(candidates),
    }

    # Top candidates (max 10), stripped of full CRH text.
    top_candidates = []
    for i, c in enumerate(candidates[:10]):
        top_candidates.append({
            "rank": i + 1,
            "index": c.get("index", i),
            "code": c.get("code"),
            "term": _truncate(c.get("term", ""), 120),
            "score": c.get("score", 0),
            "flags": {
                "is_symptom_like": c.get("is_symptom_like", False),
                "is_comorbidity_like": c.get("is_comorbidity_like", False),
                "is_act_only": c.get("is_act_only", False),
            },
            "section_strength": c.get("section_strength", 0),
            "score_details": c.get("score_details", {}),
            "evidence_snippets": [],  # deliberately empty: no full CRH text
        })

    # Gold annotation section (when an annotation exists for this case).
    gold_section = None
    if gold_case:
        gc = gold_case
        dp_exp = gc.get("dp_expected", {})
        gold_section = {
            "dp_expected": dp_exp,
            "dp_acceptable_codes": gc.get("dp_acceptable_codes", []),
            "dp_acceptable_family3": gc.get("dp_acceptable_family3", []),
            "allow_symptom_dp": gc.get("allow_symptom_dp", False),
            "confidence": gc.get("confidence", "probable"),
        }

    # Match evaluation section (when evaluate_dp() ran for this case).
    match_eval = None
    if eval_result:
        match_eval = {
            "strict_match": eval_result.get("exact_match_strict", False),
            "acceptable_match": eval_result.get("acceptable_match", False),
            "family3_match": eval_result.get("family3_match_tolerant", False),
            "symptom_not_allowed": eval_result.get("symptom_not_allowed", False),
        }

    return {
        "case_id": case_id,
        "document_type": data.get("document_type", "?"),
        "gold": gold_section,
        "prediction": prediction,
        "pool_stats": pool_stats,
        "top_candidates": top_candidates,
        "match_eval": match_eval,
    }
|
||||
|
||||
|
||||
def render_case_markdown(report: dict) -> str:
    """Render the Markdown debug page for a single case report."""
    case_id = report["case_id"]
    pr = report["prediction"]
    gld = report.get("gold")
    mt = report.get("match_eval")
    stats = report["pool_stats"]

    md: list[str] = [f"# Case Debug — {case_id}", ""]

    # Header summary block.
    md.append(f"**Type** : {report['document_type']} ")
    md.append(f"**Verdict** : {pr['verdict']} ")
    md.append(f"**Confidence** : {pr['confidence']} ")
    md.append(f"**Code choisi** : {pr['chosen_code'] or '-'} ")
    md.append(f"**Reason** : {pr['reason'] or '-'} ")
    md.append(f"**Evidence** : {pr['evidence_count']} extrait(s) ")
    md.append(f"**Pool** : {stats['raw_pool_size']} raw → {stats['filtered_pool_size']} candidats ")
    if gld:
        md.append(f"**DP attendu** : {gld['dp_expected'].get('code', '?')} ({gld['dp_expected'].get('label', '?')}) ")
        md.append(f"**Confiance gold** : {gld['confidence']} ")
    if mt:
        strict = "OK" if mt["strict_match"] else "FAIL"
        accept = "OK" if mt["acceptable_match"] else "FAIL"
        sym = "OUI" if mt["symptom_not_allowed"] else "-"
        md.append(f"**Match** : strict={strict}, acceptable={accept}, symptôme interdit={sym} ")
    md.append("")

    # Gold vs prediction comparison table.
    if gld:
        md.extend([
            "## Gold vs Prediction",
            "",
            "| | Gold | NUKE-3 |",
            "|---|------|--------|",
            f"| Code | {gld['dp_expected'].get('code', '-')} | {pr['chosen_code'] or '-'} |",
            f"| Label | {gld['dp_expected'].get('label', '-')} | {_truncate(pr['chosen_term'] or '-', 60)} |",
            f"| Codes acceptables | {', '.join(gld.get('dp_acceptable_codes', []) or ['-'])} | - |",
            f"| Family3 | {', '.join(gld.get('dp_acceptable_family3', []) or ['-'])} | - |",
            f"| Confiance | {gld['confidence']} | {pr['confidence'] or '-'} |",
            f"| Symptôme autorisé | {'oui' if gld.get('allow_symptom_dp') else 'non'} | - |",
            "",
        ])

    # Candidate table.
    md.extend([
        "## Top candidats",
        "",
        "| Rank | Code | Score | Term | Flags | Section |",
        "|------|------|-------|------|-------|---------|",
    ])
    for cand in report["top_candidates"]:
        flags = cand["flags"]
        labels = []
        if flags["is_symptom_like"]:
            labels.append("R*")
        if flags["is_comorbidity_like"]:
            labels.append("comorb")
        if flags["is_act_only"]:
            labels.append("acte")
        md.append(
            f"| {cand['rank']} "
            f"| {cand['code'] or '-'} "
            f"| {cand['score']:.1f} "
            f"| {_truncate(cand['term'], 40)} "
            f"| {', '.join(labels) or '-'} "
            f"| {cand['section_strength']} |"
        )
    md.append("")

    # Evidence excerpts (first 3 at most).
    if pr["evidence"]:
        md.extend(["## Evidence", ""])
        for idx, ev in enumerate(pr["evidence"][:3], 1):
            md.append(f"{idx}. {_truncate(str(ev), 200)}")
        md.append("")

    # Automatic bug hypothesis.
    md.extend(["## Hypothèse bug", "", _diagnose_bug(report), ""])

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    md.append(f"---\n*Généré le {stamp}*")

    return "\n".join(md)
|
||||
|
||||
|
||||
def write_case_report(report: dict, out_dir: Path) -> tuple[Path, Path]:
    """Write case_<id>.json and case_<id>.md; return both paths."""
    out_dir.mkdir(parents=True, exist_ok=True)
    case_id = report["case_id"]

    json_file = out_dir / f"case_{case_id}.json"
    md_file = out_dir / f"case_{case_id}.md"

    json_file.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    md_file.write_text(render_case_markdown(report), encoding="utf-8")

    return json_file, md_file
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B2 — Top-N erreurs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Colonnes CSV top-errors
|
||||
# Column order of the top-errors CSV export.
TOP_ERRORS_CSV_COLS = [
    # Identification
    "case_id", "document_type",
    # Prediction
    "chosen_code", "chosen_term", "verdict", "confidence",
    # Gold annotation
    "expected_code", "acceptable_codes", "acceptable_family3",
    # Match flags
    "strict_match", "acceptable_match", "family3_match", "symptom_not_allowed",
    # Pool statistics
    "raw_pool_size", "filtered_pool_size", "topk_size",
    "evidence_count", "review_reason_tag",
    # Score analysis
    "top1_score", "top2_score", "delta_top1_top2",
    "top3_codes", "top3_terms",
]
|
||||
|
||||
|
||||
def build_error_entry(report: dict) -> dict:
    """Flatten a case report into one top-errors row (CSV/JSONL friendly)."""
    pred = report["prediction"]
    gold = report.get("gold") or {}
    match = report.get("match_eval") or {}
    stats = report["pool_stats"]
    cands = report["top_candidates"]

    # Score gap between the two best candidates (0 when fewer than 2).
    top1 = cands[0]["score"] if cands else 0
    top2 = cands[1]["score"] if len(cands) >= 2 else 0

    best3 = cands[:3]
    expected = gold.get("dp_expected", {})

    return {
        "case_id": report["case_id"],
        "document_type": report["document_type"],
        "chosen_code": pred["chosen_code"] or "",
        "chosen_term": _truncate(pred["chosen_term"] or "", 60),
        "verdict": pred["verdict"] or "",
        "confidence": pred["confidence"] or "",
        "expected_code": expected.get("code", ""),
        "acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
        "acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
        "strict_match": match.get("strict_match", False),
        "acceptable_match": match.get("acceptable_match", False),
        "family3_match": match.get("family3_match", False),
        "symptom_not_allowed": match.get("symptom_not_allowed", False),
        "raw_pool_size": stats["raw_pool_size"],
        "filtered_pool_size": stats["filtered_pool_size"],
        "topk_size": stats["topk_size"],
        "evidence_count": pred["evidence_count"],
        "review_reason_tag": pred["review_reason_tag"],
        "top1_score": top1,
        "top2_score": top2,
        "delta_top1_top2": round(top1 - top2, 1),
        "top3_codes": "|".join(c["code"] or "?" for c in best3),
        "top3_terms": "|".join(_truncate(c["term"], 40) for c in best3),
        # Used for sorting only; stripped before JSONL export.
        "_sort_key": _error_sort_key(match, pred),
    }
|
||||
|
||||
|
||||
def _error_sort_key(match: dict, pred: dict) -> tuple:
|
||||
"""Clé de tri pour le top-errors (plus dangereux en premier).
|
||||
|
||||
Priorité :
|
||||
1. acceptable_match == False (vraies erreurs)
|
||||
2. verdict == CONFIRMED (les plus dangereuses)
|
||||
3. confidence == high (danger max)
|
||||
4. strict fail mais acceptable ok (moins grave)
|
||||
"""
|
||||
acceptable_fail = not match.get("acceptable_match", True)
|
||||
is_confirmed = pred.get("verdict") == "CONFIRMED"
|
||||
is_high = pred.get("confidence") == "high"
|
||||
strict_fail = not match.get("strict_match", True)
|
||||
|
||||
# Tri descendant : True avant False → on retourne des négatifs
|
||||
return (
|
||||
not acceptable_fail, # False first (True erreurs en tête)
|
||||
not is_confirmed, # False first (CONFIRMED en tête)
|
||||
not is_high, # False first (high en tête)
|
||||
not strict_fail, # False first (strict fail en tête)
|
||||
)
|
||||
|
||||
|
||||
def sort_error_entries(entries: list[dict]) -> list[dict]:
    """Return entries ordered by error priority (most dangerous first)."""
    # Entries lacking a precomputed key sort last (least dangerous).
    fallback = (True, True, True, True)
    return sorted(entries, key=lambda entry: entry.get("_sort_key", fallback))
|
||||
|
||||
|
||||
def write_top_errors_csv(entries: list[dict], path: Path) -> None:
    """Write the top-errors CSV (columns from TOP_ERRORS_CSV_COLS)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as fh:
        # extrasaction="ignore" silently drops keys such as "_sort_key".
        writer = csv.DictWriter(fh, fieldnames=TOP_ERRORS_CSV_COLS, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(entries)
|
||||
|
||||
|
||||
def write_top_errors_jsonl(entries: list[dict], path: Path) -> None:
    """Write the top-errors JSONL file (internal "_"-prefixed keys stripped)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        for entry in entries:
            public = {k: v for k, v in entry.items() if not k.startswith("_")}
            fh.write(json.dumps(public, ensure_ascii=False) + "\n")
|
||||
|
||||
|
||||
def write_top_errors_md(entries: list[dict], path: Path) -> None:
    """Write the top-errors Markdown summary table."""
    path.parent.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    md: list[str] = [
        "# NUKE-3 — Top erreurs gold CRH",
        "",
        f"**Date** : {stamp} ",
        f"**Cas** : {len(entries)} ",
        "",
        "| # | Case ID | Choisi | Attendu | Strict | Accept. | Verdict | Conf. | Delta | Reason |",
        "|---|---------|--------|---------|--------|---------|---------|-------|-------|--------|",
    ]

    for rank, entry in enumerate(entries, 1):
        strict = "OK" if entry["strict_match"] else "FAIL"
        accept = "OK" if entry["acceptable_match"] else "FAIL"
        md.append(
            f"| {rank} "
            f"| {entry['case_id']} "
            f"| {entry['chosen_code'] or '-'} "
            f"| {entry['expected_code']} "
            f"| {strict} "
            f"| {accept} "
            f"| {entry['verdict']} "
            f"| {entry['confidence']} "
            f"| {entry['delta_top1_top2']} "
            f"| {_truncate(entry.get('review_reason_tag', ''), 30)} |"
        )

    md.extend(["", "---", f"*Généré le {stamp}*"])
    path.write_text("\n".join(md), encoding="utf-8")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B3 — DIM Pack
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def select_dim_pack_cases(
    reports: list[dict],
    n: int,
) -> list[dict]:
    """Pick the N most informative CRH cases for DIM annotation.

    Mix of:
      - errors (acceptable-match failures) first,
      - frequent REVIEW verdicts,
      - symptom allowed/forbidden cases,
      - comorbidity-as-DP cases.
    """

    def _interest(rep: dict) -> float:
        """Heuristic interest score: the higher, the more worth annotating."""
        match = rep.get("match_eval") or {}
        pred = rep["prediction"]
        cands = rep["top_candidates"]

        score = 0.0
        # Acceptable-match failure: most interesting case for review.
        if match and not match.get("acceptable_match", True):
            score += 10
        # REVIEW verdict: worth a look.
        if pred.get("verdict") == "REVIEW":
            score += 5
        # Forbidden symptom chosen as DP.
        if match and match.get("symptom_not_allowed"):
            score += 3
        # Comorbidity-like candidate ranked first.
        if cands and cands[0].get("flags", {}).get("is_comorbidity_like"):
            score += 2
        # Small top1/top2 score gap signals ambiguity.
        if len(cands) >= 2 and cands[0]["score"] - cands[1]["score"] < 2:
            score += 1
        return score

    # Stable sort keeps the original order among equally interesting cases.
    ranked = sorted(reports, key=_interest, reverse=True)
    return ranked[:n]
|
||||
|
||||
|
||||
def write_dim_pack(cases: list[dict], out_dir: Path) -> tuple[Path, Path]:
    """Write the DIM pack: a pre-filled CSV plus one JSON file per case.

    Returns (csv_path, cases_dir).
    """
    date_str = datetime.now().strftime("%Y%m%d")
    csv_path = out_dir / f"DIM_PACK_{date_str}.csv"
    cases_dir = out_dir / f"DIM_PACK_{date_str}_cases"
    cases_dir.mkdir(parents=True, exist_ok=True)

    # Columns of the pre-filled annotation CSV; the DIM fills in "notes".
    dim_cols = [
        "case_id", "document_type",
        "chosen_code", "chosen_term", "verdict", "confidence",
        "dp_expected_code", "dp_expected_label",
        "dp_acceptable_codes", "dp_acceptable_family3",
        "allow_symptom_dp", "confidence_gold",
        "notes",
    ]

    with open(csv_path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=dim_cols)
        writer.writeheader()
        for rep in cases:
            gold = rep.get("gold") or {}
            pred = rep["prediction"]
            dp_exp = gold.get("dp_expected", {})
            writer.writerow({
                "case_id": rep["case_id"],
                "document_type": rep["document_type"],
                "chosen_code": pred["chosen_code"] or "",
                "chosen_term": _truncate(pred["chosen_term"] or "", 60),
                "verdict": pred["verdict"] or "",
                "confidence": pred["confidence"] or "",
                "dp_expected_code": dp_exp.get("code", ""),
                "dp_expected_label": dp_exp.get("label", ""),
                "dp_acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
                "dp_acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
                "allow_symptom_dp": gold.get("allow_symptom_dp", ""),
                "confidence_gold": gold.get("confidence", ""),
                "notes": "",
            })

    # One raw JSON per selected case, for detailed inspection.
    for rep in cases:
        case_file = cases_dir / f"{rep['case_id']}.json"
        case_file.write_text(json.dumps(rep, ensure_ascii=False, indent=2), encoding="utf-8")

    return csv_path, cases_dir
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers internes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _truncate(s: str, maxlen: int) -> str:
|
||||
if len(s) <= maxlen:
|
||||
return s
|
||||
return s[: maxlen - 3] + "..."
|
||||
|
||||
|
||||
def _classify_review_reason(reason: str | None) -> str:
|
||||
"""Classifie la raison de REVIEW en tag court."""
|
||||
if not reason:
|
||||
return "unknown"
|
||||
r = reason.lower()
|
||||
if "aucun candidat" in r:
|
||||
return "no_candidates"
|
||||
if "mono-candidat" in r or "mono_candidat" in r:
|
||||
return "mono_fragile"
|
||||
if "écart" in r or "seuil" in r:
|
||||
return "low_delta"
|
||||
if "preuve" in r or "evidence" in r:
|
||||
return "no_evidence"
|
||||
if "llm" in r:
|
||||
return "llm_unavailable"
|
||||
return "other"
|
||||
|
||||
|
||||
def _diagnose_bug(report: dict) -> str:
|
||||
"""Génère une hypothèse de bug à partir du case report."""
|
||||
pred = report["prediction"]
|
||||
match = report.get("match_eval")
|
||||
gold = report.get("gold")
|
||||
cands = report["top_candidates"]
|
||||
pool = report["pool_stats"]
|
||||
|
||||
hypotheses: list[str] = []
|
||||
|
||||
if not cands:
|
||||
hypotheses.append("**Pool vide** — aucun candidat DP n'a été extrait. "
|
||||
"Vérifier l'extraction CIM-10 sur ce document.")
|
||||
return "\n".join(hypotheses)
|
||||
|
||||
if match and gold:
|
||||
expected_code = gold.get("dp_expected", {}).get("code", "")
|
||||
|
||||
# Le code attendu est-il dans le pool ?
|
||||
pool_codes = {c["code"] for c in cands if c.get("code")}
|
||||
if expected_code and expected_code not in pool_codes:
|
||||
# Vérifier family3
|
||||
exp_fam = expected_code[:3]
|
||||
fam_in_pool = any(c["code"][:3] == exp_fam for c in cands if c.get("code"))
|
||||
if fam_in_pool:
|
||||
hypotheses.append(
|
||||
f"**Code attendu absent mais famille présente** — "
|
||||
f"`{expected_code}` absent du pool, mais famille `{exp_fam}` présente. "
|
||||
f"Problème de spécificité du code extrait."
|
||||
)
|
||||
else:
|
||||
hypotheses.append(
|
||||
f"**Code attendu absent du pool** — `{expected_code}` non extrait. "
|
||||
f"Erreur d'extraction en amont (regex/edsnlp/LLM)."
|
||||
)
|
||||
|
||||
elif expected_code in pool_codes and not match.get("strict_match"):
|
||||
# Code dans le pool mais pas sélectionné
|
||||
winner_code = pred["chosen_code"]
|
||||
expected_rank = next(
|
||||
(i + 1 for i, c in enumerate(cands) if c["code"] == expected_code), None
|
||||
)
|
||||
if expected_rank:
|
||||
hypotheses.append(
|
||||
f"**Mauvais classement** — `{expected_code}` est dans le pool "
|
||||
f"(rank {expected_rank}) mais `{winner_code}` a été choisi. "
|
||||
f"Problème de scoring (bonus/malus)."
|
||||
)
|
||||
|
||||
if match.get("symptom_not_allowed"):
|
||||
hypotheses.append(
|
||||
"**Symptôme en DP non autorisé** — un code R* a été sélectionné "
|
||||
"alors que l'étiologie était disponible."
|
||||
)
|
||||
|
||||
if not hypotheses:
|
||||
if pred["verdict"] == "REVIEW":
|
||||
hypotheses.append(
|
||||
"**Delta trop faible** — les scores des candidats sont trop proches "
|
||||
"pour une décision automatique. Le LLM ranker pourrait aider."
|
||||
)
|
||||
else:
|
||||
hypotheses.append("Aucune anomalie détectée.")
|
||||
|
||||
return "\n".join(hypotheses)
|
||||
211
src/eval/gold_models.py
Normal file
211
src/eval/gold_models.py
Normal file
@@ -0,0 +1,211 @@
|
||||
"""Gold standard CRH — modèles Pydantic + chargement + évaluation tolérante.
|
||||
|
||||
Schéma tolérant TIM : dp_expected + dp_acceptable_codes + dp_acceptable_family3.
|
||||
Permet d'évaluer NUKE-3 sur des CRH réels avec des marges de tolérance DIM.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Basic CIM-10 code shape: one uppercase letter, two digits, then an
# optional dot followed by one or two digits (e.g. "A12", "A12.3", "A12.34").
_CIM10_RE = re.compile(r"^[A-Z]\d{2}(?:\.\d{1,2})?$")


def is_valid_cim10_format(code: str) -> bool:
    """Return True when *code* has the basic CIM-10 shape (after strip/upper)."""
    normalized = code.strip().upper()
    return _CIM10_RE.match(normalized) is not None
|
||||
|
||||
|
||||
def cim10_family3(code: str) -> str:
    """Return the 3-character family prefix of a CIM-10 code."""
    normalized = code.strip().upper()
    return normalized[:3]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Modèles
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class GoldEvidence(BaseModel):
    """Short excerpt serving as evidence for the expected DP."""
    # section: CRH section the excerpt was taken from
    # excerpt: quoted text itself (capped at 240 chars, see validator)
    section: str
    excerpt: str

    @field_validator("excerpt")
    @classmethod
    def excerpt_max_length(cls, v: str) -> str:
        # Keep evidence excerpts short; the ValueError surfaces as a
        # pydantic validation error (message is a runtime string, kept as-is).
        if len(v) > 240:
            raise ValueError(f"excerpt trop long ({len(v)} > 240 chars)")
        return v
|
||||
|
||||
|
||||
class GoldDPExpected(BaseModel):
    """Expected DP (main diagnosis) as a CIM-10 code plus its label."""
    code: str
    label: str

    @field_validator("code")
    @classmethod
    def code_format(cls, v: str) -> str:
        # Normalize before validating so e.g. " a12.3 " is accepted as "A12.3".
        v = v.strip().upper()
        if not is_valid_cim10_format(v):
            raise ValueError(f"code CIM-10 invalide: {v}")
        return v
|
||||
|
||||
|
||||
class GoldCRHCase(BaseModel):
    """One annotated gold CRH case used to evaluate NUKE-3.

    Tolerant schema: dp_expected plus dp_acceptable_codes plus
    dp_acceptable_family3 give the DIM room for acceptable variants.
    """
    case_id: str
    document_type: str = "crh"
    dp_expected: GoldDPExpected
    dp_acceptable_codes: list[str] = Field(default_factory=list)
    dp_acceptable_family3: list[str] = Field(default_factory=list)
    allow_symptom_dp: bool = False
    confidence: str = "certain"  # certain | probable | ambiguous
    evidence: list[GoldEvidence] = Field(default_factory=list)
    notes: str = ""

    @field_validator("confidence")
    @classmethod
    def confidence_valid(cls, v: str) -> str:
        # Closed vocabulary for annotator confidence.
        allowed = {"certain", "probable", "ambiguous"}
        if v not in allowed:
            raise ValueError(f"confidence doit être parmi {allowed}, reçu: {v}")
        return v

    @field_validator("dp_acceptable_codes")
    @classmethod
    def acceptable_codes_format(cls, v: list[str]) -> list[str]:
        # Normalize to upper-case, drop empty entries, reject malformed codes.
        result = []
        for code in v:
            code = code.strip().upper()
            if code and not is_valid_cim10_format(code):
                raise ValueError(f"code acceptable invalide: {code}")
            if code:
                result.append(code)
        return result

    @field_validator("dp_acceptable_family3")
    @classmethod
    def family3_format(cls, v: list[str]) -> list[str]:
        # Families are exactly one letter + two digits (no dot suffix).
        result = []
        for fam in v:
            fam = fam.strip().upper()
            if fam and not re.match(r"^[A-Z]\d{2}$", fam):
                raise ValueError(f"family3 invalide: {fam} (attendu: lettre+2 chiffres)")
            if fam:
                result.append(fam)
        return result

    @field_validator("notes")
    @classmethod
    def notes_max_length(cls, v: str) -> str:
        # Keep free-text notes short enough for CSV/report exports.
        if len(v) > 400:
            raise ValueError(f"notes trop longues ({len(v)} > 400 chars)")
        return v
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chargement JSONL
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_gold_jsonl(path: Path | str) -> list[GoldCRHCase]:
    """Load a gold JSONL file (one JSON object per line).

    Blank lines and lines starting with '#' are skipped. Parsing and
    validation problems are collected and reported together (first 10).
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Fichier gold introuvable: {path}")

    cases: list[GoldCRHCase] = []
    errors: list[str] = []
    for lineno, raw in enumerate(path.read_text("utf-8").splitlines(), 1):
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        try:
            payload = json.loads(stripped)
            cases.append(GoldCRHCase(**payload))
        except Exception as exc:
            errors.append(f"ligne {lineno}: {exc}")

    if errors:
        raise ValueError(f"{len(errors)} erreur(s) dans {path.name}:\n" + "\n".join(errors[:10]))

    return cases
|
||||
|
||||
|
||||
def load_gold_index(path: Path | str) -> dict[str, GoldCRHCase]:
    """Load the gold file and index cases by case_id (duplicates rejected)."""
    index: dict[str, GoldCRHCase] = {}
    for case in load_gold_jsonl(path):
        if case.case_id in index:
            raise ValueError(f"case_id dupliqué: {case.case_id}")
        index[case.case_id] = case
    return index
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Évaluation tolérante
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def evaluate_dp(chosen_code: str | None, gold: GoldCRHCase) -> dict:
    """Evaluate a chosen DP code against a gold case, with tolerance.

    Returned metrics:
      - exact_match_strict: chosen code == dp_expected.code
      - exact_match_tolerant_codes: chosen code in dp_acceptable_codes
      - family3_match_tolerant: family3(chosen) in dp_acceptable_family3
      - acceptable_match: tolerant_codes OR family3_match
      - symptom_not_allowed: R* chosen while allow_symptom_dp is false
    """
    result = {
        "case_id": gold.case_id,
        "dp_expected_code": gold.dp_expected.code,
        "dp_expected_label": gold.dp_expected.label,
        "chosen_code": chosen_code,
        "confidence_gold": gold.confidence,
        "allow_symptom_dp": gold.allow_symptom_dp,
        "exact_match_strict": False,
        "exact_match_tolerant_codes": False,
        "family3_match_tolerant": False,
        "acceptable_match": False,
        "symptom_not_allowed": False,
    }

    # No prediction at all: every match flag stays False.
    if not chosen_code:
        return result

    chosen_up = chosen_code.strip().upper()
    expected_up = gold.dp_expected.code.upper()

    # 1) Strict match on the exact code.
    result["exact_match_strict"] = chosen_up == expected_up

    # 2) The tolerant code set always includes the expected code itself.
    acceptable = {expected_up} | {c.upper() for c in gold.dp_acceptable_codes}
    result["exact_match_tolerant_codes"] = chosen_up in acceptable

    # 3) Family (3-char prefix) tolerance; expected family always included.
    families = {expected_up[:3]} | {f.upper() for f in gold.dp_acceptable_family3}
    result["family3_match_tolerant"] = cim10_family3(chosen_up) in families

    # 4) Acceptable when either tolerance holds.
    result["acceptable_match"] = (
        result["exact_match_tolerant_codes"] or result["family3_match_tolerant"]
    )

    # 5) Symptom (R*) chosen as DP while the gold annotation forbids it.
    result["symptom_not_allowed"] = chosen_up.startswith("R") and not gold.allow_symptom_dp

    return result
|
||||
Reference in New Issue
Block a user