chore: add .gitignore

commit 2578afb6ff
parent 542797a124
Author: dom
Date: 2026-03-05 00:37:41 +01:00
1716 changed files with 1905609 additions and 18 deletions

src/eval/__init__.py Normal file (0 additions)

src/eval/gold_debug.py Normal file (548 additions)

@@ -0,0 +1,548 @@
"""Gold debug — génération de rapports détaillés pour l'évaluation NUKE-3.
Fonctions pures : reçoivent des dicts, produisent des fichiers/strings.
Pas de dépendance Ollama ni de mock.
"""
from __future__ import annotations
import csv
import json
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# B1 — Case report (JSON + Markdown)
# ---------------------------------------------------------------------------
def build_case_report(
    case_id: str,
    data: dict,
    dp_selection_raw: dict | None,
    gold_case: dict | None,
    eval_result: dict | None,
) -> dict:
    """Build the structured JSON report for a single case.
    Args:
        case_id: case identifier
        data: full pipeline JSON
        dp_selection_raw: dp_selection (dict) or None
        gold_case: GoldCRHCase.model_dump() or None
        eval_result: result of evaluate_dp() or None
    """
    dp_sel = dp_selection_raw or {}
    candidates = dp_sel.get("candidates", [])
    evidence = dp_sel.get("evidence", [])
    debug = dp_sel.get("debug_scores") or {}
    # Prediction
    prediction = {
        "chosen_code": dp_sel.get("chosen_code"),
        "chosen_term": dp_sel.get("chosen_term"),
        "verdict": dp_sel.get("verdict"),
        "confidence": dp_sel.get("confidence"),
        "reason": dp_sel.get("reason"),
        "review_reason_tag": _classify_review_reason(dp_sel.get("reason")),
        "evidence": evidence,
        "evidence_count": len(evidence),
    }
    # Pool stats
    pool_stats = {
        "raw_pool_size": len(data.get("diagnostics_associes", [])) + (1 if data.get("diagnostic_principal") else 0),
        "filtered_pool_size": len(candidates),
        "topk_size": len(candidates),
    }
    # Top candidates (max 10)
    top_candidates = []
    for i, c in enumerate(candidates[:10]):
        top_candidates.append({
            "rank": i + 1,
            "index": c.get("index", i),
            "code": c.get("code"),
            "term": _truncate(c.get("term", ""), 120),
            "score": c.get("score", 0),
            "flags": {
                "is_symptom_like": c.get("is_symptom_like", False),
                "is_comorbidity_like": c.get("is_comorbidity_like", False),
                "is_act_only": c.get("is_act_only", False),
            },
            "section_strength": c.get("section_strength", 0),
            "score_details": c.get("score_details", {}),
            "evidence_snippets": [],  # no full CRH text included
        })
    # Gold
    gold_section = None
    if gold_case:
        gc = gold_case
        dp_exp = gc.get("dp_expected", {})
        gold_section = {
            "dp_expected": dp_exp,
            "dp_acceptable_codes": gc.get("dp_acceptable_codes", []),
            "dp_acceptable_family3": gc.get("dp_acceptable_family3", []),
            "allow_symptom_dp": gc.get("allow_symptom_dp", False),
            "confidence": gc.get("confidence", "probable"),
        }
    # Match eval
    match_eval = None
    if eval_result:
        match_eval = {
            "strict_match": eval_result.get("exact_match_strict", False),
            "acceptable_match": eval_result.get("acceptable_match", False),
            "family3_match": eval_result.get("family3_match_tolerant", False),
            "symptom_not_allowed": eval_result.get("symptom_not_allowed", False),
        }
    return {
        "case_id": case_id,
        "document_type": data.get("document_type", "?"),
        "gold": gold_section,
        "prediction": prediction,
        "pool_stats": pool_stats,
        "top_candidates": top_candidates,
        "match_eval": match_eval,
    }
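# Illustrative call with minimal, hypothetical inputs (field values are made up
# for the example; real pipeline dicts carry many more keys):
# >>> report = build_case_report(
# ...     case_id="case_001",
# ...     data={"document_type": "crh", "diagnostics_associes": []},
# ...     dp_selection_raw={"chosen_code": "A41.9", "verdict": "CONFIRMED",
# ...                       "candidates": [], "evidence": []},
# ...     gold_case=None,
# ...     eval_result=None,
# ... )
# >>> report["prediction"]["chosen_code"]
# 'A41.9'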
def render_case_markdown(report: dict) -> str:
    """Render the Markdown report for a single case."""
    lines: list[str] = []
    cid = report["case_id"]
    pred = report["prediction"]
    gold = report.get("gold")
    match = report.get("match_eval")
    pool = report["pool_stats"]
    lines.append(f"# Case Debug — {cid}")
    lines.append("")
    lines.append(f"**Type** : {report['document_type']} ")
    lines.append(f"**Verdict** : {pred['verdict']} ")
    lines.append(f"**Confidence** : {pred['confidence']} ")
    lines.append(f"**Code choisi** : {pred['chosen_code'] or '-'} ")
    lines.append(f"**Reason** : {pred['reason'] or '-'} ")
    lines.append(f"**Evidence** : {pred['evidence_count']} extrait(s) ")
    lines.append(f"**Pool** : {pool['raw_pool_size']} raw → {pool['filtered_pool_size']} candidats ")
    if gold:
        lines.append(f"**DP attendu** : {gold['dp_expected'].get('code', '?')} ({gold['dp_expected'].get('label', '?')}) ")
        lines.append(f"**Confiance gold** : {gold['confidence']} ")
    if match:
        strict = "OK" if match["strict_match"] else "FAIL"
        accept = "OK" if match["acceptable_match"] else "FAIL"
        sym = "OUI" if match["symptom_not_allowed"] else "-"
        lines.append(f"**Match** : strict={strict}, acceptable={accept}, symptôme interdit={sym} ")
    lines.append("")
    # Gold vs Prediction table
    if gold:
        lines.append("## Gold vs Prediction")
        lines.append("")
        lines.append("| | Gold | NUKE-3 |")
        lines.append("|---|------|--------|")
        lines.append(f"| Code | {gold['dp_expected'].get('code', '-')} | {pred['chosen_code'] or '-'} |")
        lines.append(f"| Label | {gold['dp_expected'].get('label', '-')} | {_truncate(pred['chosen_term'] or '-', 60)} |")
        lines.append(f"| Codes acceptables | {', '.join(gold.get('dp_acceptable_codes', []) or ['-'])} | - |")
        lines.append(f"| Family3 | {', '.join(gold.get('dp_acceptable_family3', []) or ['-'])} | - |")
        lines.append(f"| Confiance | {gold['confidence']} | {pred['confidence'] or '-'} |")
        lines.append(f"| Symptôme autorisé | {'oui' if gold.get('allow_symptom_dp') else 'non'} | - |")
        lines.append("")
    # Top candidates table
    lines.append("## Top candidats")
    lines.append("")
    lines.append("| Rank | Code | Score | Term | Flags | Section |")
    lines.append("|------|------|-------|------|-------|---------|")
    for tc in report["top_candidates"]:
        flags_parts = []
        if tc["flags"]["is_symptom_like"]:
            flags_parts.append("R*")
        if tc["flags"]["is_comorbidity_like"]:
            flags_parts.append("comorb")
        if tc["flags"]["is_act_only"]:
            flags_parts.append("acte")
        flags_str = ", ".join(flags_parts) or "-"
        lines.append(
            f"| {tc['rank']} "
            f"| {tc['code'] or '-'} "
            f"| {tc['score']:.1f} "
            f"| {_truncate(tc['term'], 40)} "
            f"| {flags_str} "
            f"| {tc['section_strength']} |"
        )
    lines.append("")
    # Evidence excerpts
    if pred["evidence"]:
        lines.append("## Evidence")
        lines.append("")
        for i, ev in enumerate(pred["evidence"][:3], 1):
            lines.append(f"{i}. {_truncate(str(ev), 200)}")
        lines.append("")
    # Bug hypothesis
    lines.append("## Hypothèse bug")
    lines.append("")
    lines.append(_diagnose_bug(report))
    lines.append("")
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines.append(f"---\n*Généré le {now}*")
    return "\n".join(lines)
def write_case_report(report: dict, out_dir: Path) -> tuple[Path, Path]:
    """Write case_<id>.json and case_<id>.md; return both paths."""
    out_dir.mkdir(parents=True, exist_ok=True)
    cid = report["case_id"]
    json_path = out_dir / f"case_{cid}.json"
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    md_path = out_dir / f"case_{cid}.md"
    md_path.write_text(render_case_markdown(report), encoding="utf-8")
    return json_path, md_path
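# A hedged sketch of the full B1 flow (the output directory is an assumption,
# not a path mandated by the pipeline):
# >>> report = build_case_report("001", data, dp_sel, gold_dump, eval_res)
# >>> write_case_report(report, Path("out/gold_debug"))
# (PosixPath('out/gold_debug/case_001.json'), PosixPath('out/gold_debug/case_001.md'))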
# ---------------------------------------------------------------------------
# B2 — Top-N errors
# ---------------------------------------------------------------------------
# CSV columns for the top-errors export
TOP_ERRORS_CSV_COLS = [
    "case_id", "document_type",
    "chosen_code", "chosen_term", "verdict", "confidence",
    "expected_code", "acceptable_codes", "acceptable_family3",
    "strict_match", "acceptable_match", "family3_match", "symptom_not_allowed",
    "raw_pool_size", "filtered_pool_size", "topk_size",
    "evidence_count", "review_reason_tag",
    "top1_score", "top2_score", "delta_top1_top2",
    "top3_codes", "top3_terms",
]
def build_error_entry(report: dict) -> dict:
    """Build a top-errors entry from a case report."""
    pred = report["prediction"]
    gold = report.get("gold") or {}
    match = report.get("match_eval") or {}
    pool = report["pool_stats"]
    cands = report["top_candidates"]
    top1_score = cands[0]["score"] if cands else 0
    top2_score = cands[1]["score"] if len(cands) >= 2 else 0
    delta = top1_score - top2_score
    top3_codes = [c["code"] or "?" for c in cands[:3]]
    top3_terms = [_truncate(c["term"], 40) for c in cands[:3]]
    dp_exp = gold.get("dp_expected", {})
    return {
        "case_id": report["case_id"],
        "document_type": report["document_type"],
        "chosen_code": pred["chosen_code"] or "",
        "chosen_term": _truncate(pred["chosen_term"] or "", 60),
        "verdict": pred["verdict"] or "",
        "confidence": pred["confidence"] or "",
        "expected_code": dp_exp.get("code", ""),
        "acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
        "acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
        "strict_match": match.get("strict_match", False),
        "acceptable_match": match.get("acceptable_match", False),
        "family3_match": match.get("family3_match", False),
        "symptom_not_allowed": match.get("symptom_not_allowed", False),
        "raw_pool_size": pool["raw_pool_size"],
        "filtered_pool_size": pool["filtered_pool_size"],
        "topk_size": pool["topk_size"],
        "evidence_count": pred["evidence_count"],
        "review_reason_tag": pred["review_reason_tag"],
        "top1_score": top1_score,
        "top2_score": top2_score,
        "delta_top1_top2": round(delta, 1),
        "top3_codes": "|".join(top3_codes),
        "top3_terms": "|".join(top3_terms),
        # Used only for sorting
        "_sort_key": _error_sort_key(match, pred),
    }
def _error_sort_key(match: dict, pred: dict) -> tuple:
    """Sort key for the top-errors list (most dangerous first).
    Priority:
    1. acceptable_match == False (true errors)
    2. verdict == CONFIRMED (the most dangerous)
    3. confidence == high (maximum danger)
    4. strict fail but acceptable ok (less serious)
    """
    acceptable_fail = not match.get("acceptable_match", True)
    is_confirmed = pred.get("verdict") == "CONFIRMED"
    is_high = pred.get("confidence") == "high"
    strict_fail = not match.get("strict_match", True)
    # Ascending sort on negated flags: a True flag becomes False, and since
    # False < True, those entries come first.
    return (
        not acceptable_fail,  # true errors first
        not is_confirmed,     # CONFIRMED first
        not is_high,          # high confidence first
        not strict_fail,      # strict failures first
    )
def sort_error_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by error priority (most dangerous first)."""
    return sorted(entries, key=lambda e: e.get("_sort_key", (True, True, True, True)))
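# How the key orders entries (hypothetical values; tuples compare element-wise,
# so an all-False key sorts before anything else):
# >>> _error_sort_key({"acceptable_match": False, "strict_match": False},
# ...                 {"verdict": "CONFIRMED", "confidence": "high"})
# (False, False, False, False)
# >>> _error_sort_key({"acceptable_match": True, "strict_match": False},
# ...                 {"verdict": "REVIEW", "confidence": "low"})
# (True, True, True, False)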
def write_top_errors_csv(entries: list[dict], path: Path) -> None:
    """Write the top-errors CSV."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=TOP_ERRORS_CSV_COLS, extrasaction="ignore")
        writer.writeheader()
        for e in entries:
            writer.writerow(e)
def write_top_errors_jsonl(entries: list[dict], path: Path) -> None:
    """Write the top-errors JSONL (without _sort_key)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for e in entries:
            clean = {k: v for k, v in e.items() if not k.startswith("_")}
            f.write(json.dumps(clean, ensure_ascii=False) + "\n")
def write_top_errors_md(entries: list[dict], path: Path) -> None:
    """Write the top-errors Markdown table."""
    path.parent.mkdir(parents=True, exist_ok=True)
    now = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines: list[str] = []
    lines.append("# NUKE-3 — Top erreurs gold CRH")
    lines.append("")
    lines.append(f"**Date** : {now} ")
    lines.append(f"**Cas** : {len(entries)} ")
    lines.append("")
    lines.append("| # | Case ID | Choisi | Attendu | Strict | Accept. | Verdict | Conf. | Delta | Reason |")
    lines.append("|---|---------|--------|---------|--------|---------|---------|-------|-------|--------|")
    for i, e in enumerate(entries, 1):
        strict = "OK" if e["strict_match"] else "FAIL"
        accept = "OK" if e["acceptable_match"] else "FAIL"
        lines.append(
            f"| {i} "
            f"| {e['case_id']} "
            f"| {e['chosen_code'] or '-'} "
            f"| {e['expected_code']} "
            f"| {strict} "
            f"| {accept} "
            f"| {e['verdict']} "
            f"| {e['confidence']} "
            f"| {e['delta_top1_top2']} "
            f"| {_truncate(e.get('review_reason_tag', ''), 30)} |"
        )
    lines.append("")
    lines.append("---")
    lines.append(f"*Généré le {now}*")
    path.write_text("\n".join(lines), encoding="utf-8")
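# Hedged sketch of the full B2 export chain (file paths are illustrative):
# >>> entries = sort_error_entries([build_error_entry(r) for r in reports])
# >>> write_top_errors_csv(entries, Path("out/top_errors.csv"))
# >>> write_top_errors_jsonl(entries, Path("out/top_errors.jsonl"))
# >>> write_top_errors_md(entries, Path("out/top_errors.md"))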
# ---------------------------------------------------------------------------
# B3 — DIM Pack
# ---------------------------------------------------------------------------
def select_dim_pack_cases(
    reports: list[dict],
    n: int,
) -> list[dict]:
    """Select the N most informative CRH cases for DIM annotation.
    Mix:
    - errors (acceptable_match failures) first
    - frequent REVIEW verdicts
    - symptom allowed/forbidden cases
    - comorbidity cases
    """
    # Rank each case by how interesting it is for the DIM
    scored: list[tuple[float, dict]] = []
    for r in reports:
        match = r.get("match_eval") or {}
        pred = r["prediction"]
        cands = r["top_candidates"]
        interest = 0.0
        # An acceptable-match failure is the most interesting
        if match and not match.get("acceptable_match", True):
            interest += 10
        # A REVIEW verdict is interesting
        if pred.get("verdict") == "REVIEW":
            interest += 5
        # Forbidden symptom as DP
        if match and match.get("symptom_not_allowed"):
            interest += 3
        # Comorbidity picked as DP
        if cands and cands[0].get("flags", {}).get("is_comorbidity_like"):
            interest += 2
        # Small delta = ambiguity
        if len(cands) >= 2:
            delta = cands[0]["score"] - cands[1]["score"]
            if delta < 2:
                interest += 1
        scored.append((interest, r))
    scored.sort(key=lambda x: x[0], reverse=True)
    return [r for _, r in scored[:n]]
def write_dim_pack(cases: list[dict], out_dir: Path) -> tuple[Path, Path]:
    """Write the DIM pack: a CSV plus a directory of per-case JSON files.
    Returns (csv_path, cases_dir).
    """
    date_str = datetime.now().strftime("%Y%m%d")
    csv_path = out_dir / f"DIM_PACK_{date_str}.csv"
    cases_dir = out_dir / f"DIM_PACK_{date_str}_cases"
    cases_dir.mkdir(parents=True, exist_ok=True)
    # Pre-filled CSV
    dim_cols = [
        "case_id", "document_type",
        "chosen_code", "chosen_term", "verdict", "confidence",
        "dp_expected_code", "dp_expected_label",
        "dp_acceptable_codes", "dp_acceptable_family3",
        "allow_symptom_dp", "confidence_gold",
        "notes",
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=dim_cols)
        writer.writeheader()
        for r in cases:
            gold = r.get("gold") or {}
            pred = r["prediction"]
            dp_exp = gold.get("dp_expected", {})
            writer.writerow({
                "case_id": r["case_id"],
                "document_type": r["document_type"],
                "chosen_code": pred["chosen_code"] or "",
                "chosen_term": _truncate(pred["chosen_term"] or "", 60),
                "verdict": pred["verdict"] or "",
                "confidence": pred["confidence"] or "",
                "dp_expected_code": dp_exp.get("code", ""),
                "dp_expected_label": dp_exp.get("label", ""),
                "dp_acceptable_codes": "|".join(gold.get("dp_acceptable_codes", [])),
                "dp_acceptable_family3": "|".join(gold.get("dp_acceptable_family3", [])),
                "allow_symptom_dp": gold.get("allow_symptom_dp", ""),
                "confidence_gold": gold.get("confidence", ""),
                "notes": "",
            })
    # One JSON file per case
    for r in cases:
        case_path = cases_dir / f"{r['case_id']}.json"
        case_path.write_text(json.dumps(r, ensure_ascii=False, indent=2), encoding="utf-8")
    return csv_path, cases_dir
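# Hedged B3 usage sketch ("reports" are outputs of build_case_report; the
# output directory is an assumption):
# >>> picked = select_dim_pack_cases(reports, n=20)
# >>> csv_path, cases_dir = write_dim_pack(picked, Path("out/dim"))
# writes out/dim/DIM_PACK_<YYYYMMDD>.csv plus one JSON per case under
# out/dim/DIM_PACK_<YYYYMMDD>_cases/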
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _truncate(s: str, maxlen: int) -> str:
    if len(s) <= maxlen:
        return s
    return s[: maxlen - 3] + "..."
def _classify_review_reason(reason: str | None) -> str:
    """Classify a REVIEW reason into a short tag (the matched substrings are French, as produced upstream)."""
    if not reason:
        return "unknown"
    r = reason.lower()
    if "aucun candidat" in r:
        return "no_candidates"
    if "mono-candidat" in r or "mono_candidat" in r:
        return "mono_fragile"
    if "écart" in r or "seuil" in r:
        return "low_delta"
    if "preuve" in r or "evidence" in r:
        return "no_evidence"
    if "llm" in r:
        return "llm_unavailable"
    return "other"
def _diagnose_bug(report: dict) -> str:
    """Generate a bug hypothesis from the case report."""
    pred = report["prediction"]
    match = report.get("match_eval")
    gold = report.get("gold")
    cands = report["top_candidates"]
    pool = report["pool_stats"]
    hypotheses: list[str] = []
    if not cands:
        hypotheses.append("**Pool vide** — aucun candidat DP n'a été extrait. "
                          "Vérifier l'extraction CIM-10 sur ce document.")
        return "\n".join(hypotheses)
    if match and gold:
        expected_code = gold.get("dp_expected", {}).get("code", "")
        # Is the expected code in the pool at all?
        pool_codes = {c["code"] for c in cands if c.get("code")}
        if expected_code and expected_code not in pool_codes:
            # Check the 3-character family
            exp_fam = expected_code[:3]
            fam_in_pool = any(c["code"][:3] == exp_fam for c in cands if c.get("code"))
            if fam_in_pool:
                hypotheses.append(
                    f"**Code attendu absent mais famille présente** — "
                    f"`{expected_code}` absent du pool, mais famille `{exp_fam}` présente. "
                    f"Problème de spécificité du code extrait."
                )
            else:
                hypotheses.append(
                    f"**Code attendu absent du pool** — `{expected_code}` non extrait. "
                    f"Erreur d'extraction en amont (regex/edsnlp/LLM)."
                )
        elif expected_code in pool_codes and not match.get("strict_match"):
            # The code is in the pool but was not selected
            winner_code = pred["chosen_code"]
            expected_rank = next(
                (i + 1 for i, c in enumerate(cands) if c["code"] == expected_code), None
            )
            if expected_rank:
                hypotheses.append(
                    f"**Mauvais classement** — `{expected_code}` est dans le pool "
                    f"(rank {expected_rank}) mais `{winner_code}` a été choisi. "
                    f"Problème de scoring (bonus/malus)."
                )
        if match.get("symptom_not_allowed"):
            hypotheses.append(
                "**Symptôme en DP non autorisé** — un code R* a été sélectionné "
                "alors que l'étiologie était disponible."
            )
    if not hypotheses:
        if pred["verdict"] == "REVIEW":
            hypotheses.append(
                "**Delta trop faible** — les scores des candidats sont trop proches "
                "pour une décision automatique. Le LLM ranker pourrait aider."
            )
        else:
            hypotheses.append("Aucune anomalie détectée.")
    return "\n".join(hypotheses)

src/eval/gold_models.py Normal file (211 additions)

@@ -0,0 +1,211 @@
"""Gold standard CRH — modèles Pydantic + chargement + évaluation tolérante.
Schéma tolérant TIM : dp_expected + dp_acceptable_codes + dp_acceptable_family3.
Permet d'évaluer NUKE-3 sur des CRH réels avec des marges de tolérance DIM.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, Field, field_validator
# ---------------------------------------------------------------------------
# Validation CIM-10
# ---------------------------------------------------------------------------
_CIM10_RE = re.compile(r"^[A-Z]\d{2}(?:\.\d{1,2})?$")
def is_valid_cim10_format(code: str) -> bool:
"""Vérifie le format CIM-10 basique (lettre + 2 chiffres + optionnel .X ou .XX)."""
return bool(_CIM10_RE.match(code.strip().upper()))
def cim10_family3(code: str) -> str:
"""Extrait le préfixe 3 caractères (famille) d'un code CIM-10."""
return code.strip().upper()[:3]
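# Doctest-style checks of the format helpers (behavior follows directly from
# the regex above):
# >>> is_valid_cim10_format("a41.9")
# True
# >>> is_valid_cim10_format("A4")
# False
# >>> cim10_family3("j18.9")
# 'J18'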
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
class GoldEvidence(BaseModel):
    """Short excerpt used as evidence for the expected DP."""
    section: str
    excerpt: str
    @field_validator("excerpt")
    @classmethod
    def excerpt_max_length(cls, v: str) -> str:
        if len(v) > 240:
            raise ValueError(f"excerpt trop long ({len(v)} > 240 chars)")
        return v
class GoldDPExpected(BaseModel):
    """Expected DP with code + label."""
    code: str
    label: str
    @field_validator("code")
    @classmethod
    def code_format(cls, v: str) -> str:
        v = v.strip().upper()
        if not is_valid_cim10_format(v):
            raise ValueError(f"code CIM-10 invalide: {v}")
        return v
class GoldCRHCase(BaseModel):
    """A gold CRH case annotated for NUKE-3 evaluation."""
    case_id: str
    document_type: str = "crh"
    dp_expected: GoldDPExpected
    dp_acceptable_codes: list[str] = Field(default_factory=list)
    dp_acceptable_family3: list[str] = Field(default_factory=list)
    allow_symptom_dp: bool = False
    confidence: str = "certain"  # certain | probable | ambiguous
    evidence: list[GoldEvidence] = Field(default_factory=list)
    notes: str = ""
    @field_validator("confidence")
    @classmethod
    def confidence_valid(cls, v: str) -> str:
        allowed = {"certain", "probable", "ambiguous"}
        if v not in allowed:
            raise ValueError(f"confidence doit être parmi {allowed}, reçu: {v}")
        return v
    @field_validator("dp_acceptable_codes")
    @classmethod
    def acceptable_codes_format(cls, v: list[str]) -> list[str]:
        result = []
        for code in v:
            code = code.strip().upper()
            if code and not is_valid_cim10_format(code):
                raise ValueError(f"code acceptable invalide: {code}")
            if code:
                result.append(code)
        return result
    @field_validator("dp_acceptable_family3")
    @classmethod
    def family3_format(cls, v: list[str]) -> list[str]:
        result = []
        for fam in v:
            fam = fam.strip().upper()
            if fam and not re.match(r"^[A-Z]\d{2}$", fam):
                raise ValueError(f"family3 invalide: {fam} (attendu: lettre+2 chiffres)")
            if fam:
                result.append(fam)
        return result
    @field_validator("notes")
    @classmethod
    def notes_max_length(cls, v: str) -> str:
        if len(v) > 400:
            raise ValueError(f"notes trop longues ({len(v)} > 400 chars)")
        return v
# ---------------------------------------------------------------------------
# JSONL loading
# ---------------------------------------------------------------------------
def load_gold_jsonl(path: Path | str) -> list[GoldCRHCase]:
    """Load a gold JSONL file (one JSON object per line)."""
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Fichier gold introuvable: {path}")
    cases: list[GoldCRHCase] = []
    errors: list[str] = []
    for i, line in enumerate(path.read_text("utf-8").splitlines(), 1):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        try:
            data = json.loads(line)
            cases.append(GoldCRHCase(**data))
        except Exception as e:
            errors.append(f"ligne {i}: {e}")
    if errors:
        raise ValueError(f"{len(errors)} erreur(s) dans {path.name}:\n" + "\n".join(errors[:10]))
    return cases
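# A minimal, hypothetical gold line (one JSON object per line; only case_id
# and dp_expected are required, everything else falls back to defaults):
# {"case_id": "crh_001",
#  "dp_expected": {"code": "J15.9", "label": "Pneumonie bactérienne"},
#  "dp_acceptable_codes": ["J18.9"], "dp_acceptable_family3": ["J15"],
#  "confidence": "probable"}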
def load_gold_index(path: Path | str) -> dict[str, GoldCRHCase]:
    """Load the gold file and return a case_id → GoldCRHCase dict."""
    cases = load_gold_jsonl(path)
    index: dict[str, GoldCRHCase] = {}
    for c in cases:
        if c.case_id in index:
            raise ValueError(f"case_id dupliqué: {c.case_id}")
        index[c.case_id] = c
    return index
# ---------------------------------------------------------------------------
# Tolerant evaluation
# ---------------------------------------------------------------------------
def evaluate_dp(chosen_code: str | None, gold: GoldCRHCase) -> dict:
    """Evaluate a chosen DP code against a gold case, with tolerance.
    Returns a dict with the evaluation metrics:
    - exact_match_strict: chosen code == dp_expected.code
    - exact_match_tolerant_codes: chosen code in dp_acceptable_codes
    - family3_match_tolerant: family3(chosen code) in dp_acceptable_family3
    - acceptable_match: tolerant_codes OR family3_match
    - symptom_not_allowed: R* chosen AND allow_symptom_dp == false
    """
    result = {
        "case_id": gold.case_id,
        "dp_expected_code": gold.dp_expected.code,
        "dp_expected_label": gold.dp_expected.label,
        "chosen_code": chosen_code,
        "confidence_gold": gold.confidence,
        "allow_symptom_dp": gold.allow_symptom_dp,
        "exact_match_strict": False,
        "exact_match_tolerant_codes": False,
        "family3_match_tolerant": False,
        "acceptable_match": False,
        "symptom_not_allowed": False,
    }
    if not chosen_code:
        return result
    code_up = chosen_code.strip().upper()
    expected_up = gold.dp_expected.code.upper()
    # 1) Strict match
    result["exact_match_strict"] = code_up == expected_up
    # 2) Tolerant codes (the expected code is always acceptable)
    all_acceptable = {expected_up} | {c.upper() for c in gold.dp_acceptable_codes}
    result["exact_match_tolerant_codes"] = code_up in all_acceptable
    # 3) Family3 match
    code_fam = cim10_family3(code_up)
    all_families = {expected_up[:3]} | {f.upper() for f in gold.dp_acceptable_family3}
    result["family3_match_tolerant"] = code_fam in all_families
    # 4) Acceptable = tolerant OR family3
    result["acceptable_match"] = (
        result["exact_match_tolerant_codes"] or result["family3_match_tolerant"]
    )
    # 5) Symptom penalty
    if code_up.startswith("R") and not gold.allow_symptom_dp:
        result["symptom_not_allowed"] = True
    return result
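# Hedged example of the tolerant evaluation (the gold case is hypothetical):
# >>> gold = GoldCRHCase(
# ...     case_id="crh_001",
# ...     dp_expected={"code": "J15.9", "label": "Pneumonie bactérienne"},
# ...     dp_acceptable_family3=["J18"],
# ... )
# >>> r = evaluate_dp("J18.9", gold)
# >>> (r["exact_match_strict"], r["family3_match_tolerant"], r["acceptable_match"])
# (False, True, True)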