refactor: reorganize reference data, new extraction modules, remove obsolete code

- Reorganized data/referentiels/: pdfs/, dicts/, user/ (unified layout)
- Fixed "Source absente" badges on the reference-data admin page
- Re-indexed COCOA 2025 (555 → 1451 chunks, 94% coverage)
- Fixed VRAM OOM: embeddings forced onto CPU via T2A_EMBED_CPU (see the sketch after this list)
- New modules: document_router, docx_extractor, image_extractor, ocr_engine
- Completeness module (quality/completude.py + YAML config)
- DIM template (dimensional summary)
- Gunicorn config + t2a-viewer systemd service
- Removed t2a_install_rag_cleanup/ (obsolete copy)
- Removed scripts/ and scripts_t2a_v2/ (old benchmarks)
- Removed 81 _doc.txt test files
- Ollama cache: configurable TTL, YAML loader fixes (see the sketch after this list)
- Dashboard: template improvements (base, index, detail, cpam, validation)
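The VRAM fix and the Ollama cache above are both environment-driven. A minimal sketch of the mechanism, assuming that everything beyond T2A_EMBED_CPU (the T2A_CACHE_TTL variable, the embed_device/TTLCache names) is illustrative rather than taken from the codebase:

import os
import time
from typing import Any

def embed_device() -> str:
    # Any non-empty value of T2A_EMBED_CPU pins the embedding model to CPU,
    # trading throughput for VRAM headroom.
    return "cpu" if os.environ.get("T2A_EMBED_CPU") else "cuda"

class TTLCache:
    """Tiny response cache; TTL in seconds read from T2A_CACHE_TTL (hypothetical name)."""

    def __init__(self) -> None:
        self.ttl = float(os.environ.get("T2A_CACHE_TTL", "3600"))
        self._store: dict[str, tuple[float, Any]] = {}

    def get(self, key: str) -> Any | None:
        hit = self._store.get(key)
        if hit and time.time() - hit[0] < self.ttl:
            return hit[1]
        self._store.pop(key, None)  # drop expired (or absent) entries
        return None

    def put(self, key: str, value: Any) -> None:
        self._store[key] = (time.time(), value)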

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 4e2b4bd946
parent 2578afb6ff
Author: dom
Date: 2026-03-07 16:48:10 +01:00
210 changed files with 6939 additions and 22104 deletions

scripts/benchmark_models.py (deleted)

@@ -1,313 +0,0 @@
#!/usr/bin/env python3
"""Benchmark A/B : gemma3:12b (base) vs pmsi-coder-v2 (fine-tuné).
Compare les codes CIM-10 produits par les deux modèles sur N dossiers.
Teste DP + DAS (échantillon) pour chaque dossier.
Usage: python scripts/benchmark_models.py [--n 50] [--das-max 5]
"""
from __future__ import annotations
import json
import random
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.config import STRUCTURED_DIR, OLLAMA_URL, DossierMedical
from src.medical.cim10_dict import load_dict, normalize_code, validate_code
import requests
MODEL_BASE = "gemma3:12b"
MODEL_FINETUNED = "pmsi-coder-v2"
PROMPT_TEMPLATE = """Tu es un médecin DIM expert en codage PMSI.
Code le diagnostic suivant en CIM-10. Choisis le code le plus spécifique possible.
DIAGNOSTIC : "{texte}"
TYPE : {type_diag}
{contexte}
Réponds UNIQUEMENT avec un objet JSON :
{{"code": "X99.9", "confidence": "high|medium|low", "justification": "explication courte"}}"""
def call_model(prompt: str, model: str, timeout: int = 120) -> tuple[dict | None, float]:
"""Appelle un modèle Ollama et retourne (résultat, durée_s)."""
t0 = time.time()
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": False,
"format": "json",
"options": {"temperature": 0.1, "num_predict": 500},
},
timeout=timeout,
)
resp.raise_for_status()
raw = resp.json().get("response", "")
duration = time.time() - t0
try:
return json.loads(raw), duration
except json.JSONDecodeError:
return None, duration
except Exception:
return None, time.time() - t0
def load_dossiers(n: int) -> list[dict]:
"""Charge N dossiers fusionnés diversifiés."""
dossiers = []
for subdir in sorted(STRUCTURED_DIR.iterdir()):
if not subdir.is_dir():
continue
for f in subdir.glob("*fusionne*.json"):
if ".gemma_" in f.name or ".bak" in f.name:
continue
try:
data = json.loads(f.read_text(encoding="utf-8"))
d = DossierMedical.model_validate(data)
if d.diagnostic_principal and d.diagnostic_principal.cim10_suggestion:
dossiers.append({
"name": subdir.name,
"dossier": d,
"path": str(f),
})
except Exception:
continue
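# One dossier per directory: stop after the first usable fused JSON.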
break
random.seed(42)
random.shuffle(dossiers)
return dossiers[:n]
def build_contexte(d: DossierMedical) -> str:
"""Construit un contexte clinique résumé."""
parts = []
s = d.sejour
if s.age is not None:
parts.append(f"Patient {s.sexe or '?'}, {s.age} ans")
if s.duree_sejour is not None:
parts.append(f"Durée séjour : {s.duree_sejour}j")
if d.diagnostic_principal:
parts.append(f"DP : {d.diagnostic_principal.texte}")
bio = [f"{b.test}={b.valeur}" for b in d.biologie_cle[:5] if b.valeur]
if bio:
parts.append(f"Bio : {', '.join(bio)}")
return "CONTEXTE : " + " | ".join(parts) if parts else ""
def code_match_level(code_a: str, code_b: str) -> str:
"""Retourne le niveau de correspondance entre deux codes."""
if code_a == code_b:
return "exact"
if code_a[:3] == code_b[:3]:
return "categorie"
return "diff"
def run_benchmark(n: int = 50, das_max: int = 5):
print(f"=== Benchmark A/B : {MODEL_BASE} vs {MODEL_FINETUNED} ===")
print(f" Dossiers : {n}, DAS max/dossier : {das_max}\n")
# Vérifier que les deux modèles sont disponibles
for model in [MODEL_BASE, MODEL_FINETUNED]:
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={"model": model, "prompt": "test", "stream": False,
"options": {"num_predict": 1}},
timeout=60,
)
resp.raise_for_status()
print(f" {model} : OK")
except Exception as e:
print(f" {model} : ERREUR — {e}")
sys.exit(1)
dossiers = load_dossiers(n)
print(f"\nDossiers chargés : {len(dossiers)}\n")
cim10 = load_dict()
t_global_start = time.time()
dp_results = []
das_results = []
for i, item in enumerate(dossiers, 1):
d = item["dossier"]
dp = d.diagnostic_principal
name = item["name"]
ctx = build_contexte(d)
# === DP ===
prompt_dp = PROMPT_TEMPLATE.format(
texte=dp.texte,
type_diag="DP (diagnostic principal)",
contexte=ctx,
)
res_base, t_base = call_model(prompt_dp, MODEL_BASE)
res_ft, t_ft = call_model(prompt_dp, MODEL_FINETUNED)
code_base = normalize_code(res_base.get("code", "")) if res_base else "ERREUR"
code_ft = normalize_code(res_ft.get("code", "")) if res_ft else "ERREUR"
conf_base = res_base.get("confidence", "?") if res_base else "?"
conf_ft = res_ft.get("confidence", "?") if res_ft else "?"
valid_base = validate_code(code_base)[0] if code_base != "ERREUR" else False
valid_ft = validate_code(code_ft)[0] if code_ft != "ERREUR" else False
pipeline_code = dp.cim10_suggestion
match_level = code_match_level(code_base, code_ft)
dp_result = {
"dossier": name,
"texte": dp.texte[:80],
"pipeline": pipeline_code,
"base": code_base,
"ft": code_ft,
"conf_base": conf_base,
"conf_ft": conf_ft,
"valid_base": valid_base,
"valid_ft": valid_ft,
"match": match_level,
"t_base": round(t_base, 2),
"t_ft": round(t_ft, 2),
}
dp_results.append(dp_result)
tag = {"exact": "=", "categorie": "~", "diff": "X"}[match_level]
print(f" [{i:2d}/{len(dossiers)}] {name:<20s} DP=\"{dp.texte[:35]:<35s}\" "
f"base={code_base:<7s} ft={code_ft:<7s} [{tag}] "
f"({t_base:.1f}s / {t_ft:.1f}s)")
# === DAS (échantillon) ===
das_list = [das for das in d.diagnostics_associes
if das.texte and das.cim10_suggestion]
if len(das_list) > das_max:
random.seed(name)  # str seeding is deterministic; hash() is salted per process
das_list = random.sample(das_list, das_max)
for das in das_list:
prompt_das = PROMPT_TEMPLATE.format(
texte=das.texte,
type_diag="DAS (diagnostic associé significatif)",
contexte=ctx,
)
res_b, tb = call_model(prompt_das, MODEL_BASE)
res_f, tf = call_model(prompt_das, MODEL_FINETUNED)
cb = normalize_code(res_b.get("code", "")) if res_b else "ERREUR"
cf = normalize_code(res_f.get("code", "")) if res_f else "ERREUR"
vb = validate_code(cb)[0] if cb != "ERREUR" else False
vf = validate_code(cf)[0] if cf != "ERREUR" else False
das_results.append({
"dossier": name,
"texte": das.texte[:80],
"pipeline": das.cim10_suggestion,
"base": cb,
"ft": cf,
"conf_base": (res_b or {}).get("confidence", "?"),
"conf_ft": (res_f or {}).get("confidence", "?"),
"valid_base": vb,
"valid_ft": vf,
"match": code_match_level(cb, cf),
"t_base": round(tb, 2),
"t_ft": round(tf, 2),
})
t_global = time.time() - t_global_start
# === RÉSUMÉ ===
print(f"\n{'='*75}")
print(f"RÉSUMÉ — {len(dp_results)} dossiers, {len(das_results)} DAS testés")
print(f"Durée totale : {t_global/60:.1f} min\n")
for label, results in [("DP", dp_results), ("DAS", das_results)]:
if not results:
continue
nt = len(results)
n_exact = sum(1 for r in results if r["match"] == "exact")
n_cat = sum(1 for r in results if r["match"] == "categorie")
n_diff = sum(1 for r in results if r["match"] == "diff")
n_vb = sum(1 for r in results if r["valid_base"])
n_vf = sum(1 for r in results if r["valid_ft"])
avg_tb = sum(r["t_base"] for r in results) / nt
avg_tf = sum(r["t_ft"] for r in results) / nt
# Confiance
conf_b = {}
conf_f = {}
for r in results:
conf_b[r["conf_base"]] = conf_b.get(r["conf_base"], 0) + 1
conf_f[r["conf_ft"]] = conf_f.get(r["conf_ft"], 0) + 1
# Concordance avec pipeline (gemma run original)
n_base_eq_pipe = sum(1 for r in results if r["base"] == r["pipeline"])
n_ft_eq_pipe = sum(1 for r in results if r["ft"] == r["pipeline"])
n_base_cat_pipe = sum(1 for r in results
if r["base"][:3] == r["pipeline"][:3])
n_ft_cat_pipe = sum(1 for r in results
if r["ft"][:3] == r["pipeline"][:3])
print(f" --- {label} ({nt} diagnostics) ---")
print(f" Concordance base↔ft :")
print(f" Exact : {n_exact}/{nt} ({100*n_exact/nt:.0f}%)")
print(f" Catégorie : {n_exact+n_cat}/{nt} ({100*(n_exact+n_cat)/nt:.0f}%)")
print(f" Différent : {n_diff}/{nt} ({100*n_diff/nt:.0f}%)")
print(f" Codes valides :")
print(f" base : {n_vb}/{nt} ({100*n_vb/nt:.0f}%)")
print(f" ft : {n_vf}/{nt} ({100*n_vf/nt:.0f}%)")
print(f" vs pipeline (gemma original) :")
print(f" base=pipe : {n_base_eq_pipe}/{nt} exact, {n_base_cat_pipe}/{nt} catégorie")
print(f" ft=pipe : {n_ft_eq_pipe}/{nt} exact, {n_ft_cat_pipe}/{nt} catégorie")
print(f" Temps moyen : base={avg_tb:.2f}s ft={avg_tf:.2f}s (Δ={100*(avg_tf-avg_tb)/avg_tb:+.0f}%)")
print(f" Confiance base : {conf_b}")
print(f" Confiance ft : {conf_f}")
print()
# Lister les différences DP
diffs_dp = [r for r in dp_results if r["match"] == "diff"]
if diffs_dp:
print(f" Différences DP ({len(diffs_dp)}) :")
for r in diffs_dp:
vb = "" if r["valid_base"] else ""
vf = "" if r["valid_ft"] else ""
print(f" {r['dossier']:<18s} \"{r['texte'][:40]}\"")
print(f" base={r['base']:<7s}{vb} ft={r['ft']:<7s}{vf} pipe={r['pipeline']}")
# Sauvegarder
out = {
"meta": {
"date": time.strftime("%Y-%m-%dT%H:%M:%S"),
"model_base": MODEL_BASE,
"model_ft": MODEL_FINETUNED,
"n_dossiers": len(dp_results),
"n_das": len(das_results),
"duration_min": round(t_global / 60, 1),
},
"dp": dp_results,
"das": das_results,
}
out_path = Path(__file__).parent.parent / "output" / "benchmark_ab.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"\nRésultats détaillés : {out_path}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--n", type=int, default=50,
help="Nombre de dossiers à tester")
parser.add_argument("--das-max", type=int, default=5,
help="Max DAS testés par dossier")
args = parser.parse_args()
run_benchmark(args.n, args.das_max)

scripts/benchmark_nuke3_compare.py (deleted)

@@ -1,874 +0,0 @@
#!/usr/bin/env python3
"""Benchmark NUKE-3 — rapport comparatif LLM off vs on.
Analyse les dossiers JSON existants (output/structured/) pour produire
des métriques DIM-like sur la sélection DP (NUKE-3).
Mode 1 (par défaut) : analyse les JSON existants (pas d'Ollama requis).
Mode 2 (--rerun) : relance le pipeline 2× (LLM off puis LLM on) —
nécessite Ollama pour le mode "on".
Usage:
python scripts/benchmark_nuke3_compare.py # analyse offline
python scripts/benchmark_nuke3_compare.py --n 10 # top 10 dossiers
python scripts/benchmark_nuke3_compare.py --rerun --n 5 # relance pipeline
python scripts/benchmark_nuke3_compare.py --dossiers A,B,C # dossiers spécifiques
python scripts/benchmark_nuke3_compare.py --gold data/gold_crh/gold_crh.jsonl
python scripts/benchmark_nuke3_compare.py --offline --case-id 74_23141536
python scripts/benchmark_nuke3_compare.py --offline --top-errors 20
python scripts/benchmark_nuke3_compare.py --offline --dim-pack 20
"""
from __future__ import annotations
import argparse
import csv
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
OUTPUT_DIR = ROOT / "output" / "structured"
INPUT_DIR = ROOT / "input"
REPORT_PATH = ROOT / "docs" / "NUKE3_BENCHMARK_REPORT.md"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Chargement JSON
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def load_dossier_json(dossier_id: str) -> dict | None:
"""Charge le JSON d'un dossier."""
path = find_merged_json(dossier_id)
if not path:
return None
try:
return json.loads(path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return None
def select_dossiers(n: int, specific: list[str] | None) -> list[str]:
"""Sélectionne les dossiers à analyser."""
if specific:
return [d.strip() for d in specific if d.strip()]
all_dirs = sorted(
d.name for d in OUTPUT_DIR.iterdir()
if d.is_dir() and find_merged_json(d.name) is not None
)
return all_dirs[:n] if n > 0 else all_dirs
# ---------------------------------------------------------------------------
# Analyse NUKE-3 d'un dossier
# ---------------------------------------------------------------------------
def analyze_dp_selection(data: dict) -> dict:
"""Extrait les métriques NUKE-3 d'un dossier."""
dp_sel = data.get("dp_selection")
result = {
"has_dp_selection": dp_sel is not None,
"verdict": None,
"confidence": None,
"chosen_code": None,
"n_candidates": 0,
"n_evidence": 0,
"is_comorbidity_dp": False,
"is_symptom_dp": False,
"is_act_only_dp": False,
"has_evidence": False,
"delta": None,
"reason": None,
}
if not dp_sel:
return result
result["verdict"] = dp_sel.get("verdict")
result["confidence"] = dp_sel.get("confidence")
result["chosen_code"] = dp_sel.get("chosen_code")
candidates = dp_sel.get("candidates", [])
result["n_candidates"] = len(candidates)
evidence = dp_sel.get("evidence", [])
result["n_evidence"] = len(evidence)
result["has_evidence"] = len(evidence) > 0
result["reason"] = dp_sel.get("reason")
# Debug scores
debug = dp_sel.get("debug_scores") or {}
result["delta"] = debug.get("delta")
# Flags du gagnant
if candidates:
winner = candidates[0]
result["is_comorbidity_dp"] = winner.get("is_comorbidity_like", False)
result["is_symptom_dp"] = winner.get("is_symptom_like", False)
result["is_act_only_dp"] = winner.get("is_act_only", False)
return result
# ---------------------------------------------------------------------------
# Agrégation
# ---------------------------------------------------------------------------
def compute_metrics(analyses: list[dict]) -> dict:
"""Calcule les métriques agrégées DIM-like."""
n = len(analyses)
if n == 0:
return {"n": 0}
with_selection = [a for a in analyses if a["has_dp_selection"]]
n_sel = len(with_selection)
confirmed = [a for a in with_selection if a["verdict"] == "CONFIRMED"]
review = [a for a in with_selection if a["verdict"] == "REVIEW"]
# Métriques principales
confirmed_rate = len(confirmed) / n_sel if n_sel else 0
# Evidence
confirmed_with_evidence = sum(1 for a in confirmed if a["has_evidence"])
confirmed_evidence_rate = confirmed_with_evidence / len(confirmed) if confirmed else 0
# Codes problématiques en DP
symptom_count = sum(1 for a in with_selection if a["is_symptom_dp"])
comorbidity_count = sum(1 for a in with_selection if a["is_comorbidity_dp"])
act_only_count = sum(1 for a in with_selection if a["is_act_only_dp"])
# Confidence
conf_high = sum(1 for a in with_selection if a["confidence"] == "high")
conf_med = sum(1 for a in with_selection if a["confidence"] == "medium")
conf_low = sum(1 for a in with_selection if a["confidence"] == "low")
# R-codes en DP (symptômes)
r_code_count = sum(
1 for a in with_selection
if a["chosen_code"] and a["chosen_code"].startswith("R")
)
return {
"n_total": n,
"n_with_selection": n_sel,
"confirmed_count": len(confirmed),
"review_count": len(review),
"confirmed_rate": round(confirmed_rate, 3),
"review_rate": round(1 - confirmed_rate, 3) if n_sel else 0,
"confirmed_evidence_rate": round(confirmed_evidence_rate, 3),
"dp_symptom_rate": round(symptom_count / n_sel, 3) if n_sel else 0,
"dp_comorbidity_rate": round(comorbidity_count / n_sel, 3) if n_sel else 0,
"dp_act_only_rate": round(act_only_count / n_sel, 3) if n_sel else 0,
"dp_r_code_rate": round(r_code_count / n_sel, 3) if n_sel else 0,
"confidence": {
"high": conf_high,
"medium": conf_med,
"low": conf_low,
},
"confidence_high_rate": round(conf_high / n_sel, 3) if n_sel else 0,
}
# ---------------------------------------------------------------------------
# Évaluation gold CRH
# ---------------------------------------------------------------------------
def load_gold(gold_path: str | Path) -> dict:
"""Charge le gold JSONL et retourne un index case_id → GoldCRHCase."""
from src.eval.gold_models import load_gold_index
return load_gold_index(Path(gold_path))
def evaluate_gold_cases(
dossier_details: list[dict],
gold_index: dict,
) -> list[dict]:
"""Évalue les dossiers présents dans le gold. Retourne une liste d'évaluations."""
from src.eval.gold_models import evaluate_dp
evals: list[dict] = []
for d in dossier_details:
case_id = d["id"]
if case_id not in gold_index:
continue
gold_case = gold_index[case_id]
sel = d.get("dp_selection") or {}
chosen_code = sel.get("chosen_code")
verdict = sel.get("verdict")
confidence = sel.get("confidence")
ev = evaluate_dp(chosen_code, gold_case)
ev["verdict"] = verdict
ev["confidence_nuke3"] = confidence
evals.append(ev)
return evals
def compute_gold_metrics(evals: list[dict]) -> dict:
"""Calcule les métriques agrégées sur les cas gold."""
n = len(evals)
if n == 0:
return {"n": 0}
strict = sum(1 for e in evals if e["exact_match_strict"])
tolerant = sum(1 for e in evals if e["exact_match_tolerant_codes"])
family3 = sum(1 for e in evals if e["family3_match_tolerant"])
acceptable = sum(1 for e in evals if e["acceptable_match"])
symptom_bad = sum(1 for e in evals if e["symptom_not_allowed"])
# Confirmed-only accuracy
confirmed_evals = [e for e in evals if e["verdict"] == "CONFIRMED"]
n_conf = len(confirmed_evals)
conf_acceptable = sum(1 for e in confirmed_evals if e["acceptable_match"])
return {
"n": n,
"exact_match_strict": strict,
"exact_match_strict_rate": round(strict / n, 3),
"exact_match_tolerant": tolerant,
"exact_match_tolerant_rate": round(tolerant / n, 3),
"family3_match": family3,
"family3_match_rate": round(family3 / n, 3),
"acceptable_match": acceptable,
"acceptable_match_rate": round(acceptable / n, 3),
"confirmed_accuracy_tolerant": round(conf_acceptable / n_conf, 3) if n_conf else None,
"confirmed_count": n_conf,
"symptom_not_allowed": symptom_bad,
"symptom_not_allowed_rate": round(symptom_bad / n, 3),
}
def write_gold_eval_csv(evals: list[dict], csv_path: Path) -> None:
"""Écrit le CSV d'évaluation gold."""
cols = [
"case_id", "chosen_code", "verdict", "confidence_nuke3",
"dp_expected_code", "acceptable_match", "exact_match_strict",
"symptom_not_allowed", "allow_symptom_dp", "confidence_gold",
]
csv_path.parent.mkdir(parents=True, exist_ok=True)
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=cols, extrasaction="ignore")
writer.writeheader()
for ev in evals:
row = {
"case_id": ev["case_id"],
"chosen_code": ev["chosen_code"] or "",
"verdict": ev["verdict"] or "",
"confidence_nuke3": ev["confidence_nuke3"] or "",
"dp_expected_code": ev["dp_expected_code"],
"acceptable_match": ev["acceptable_match"],
"exact_match_strict": ev["exact_match_strict"],
"symptom_not_allowed": ev["symptom_not_allowed"],
"allow_symptom_dp": ev["allow_symptom_dp"],
"confidence_gold": ev["confidence_gold"],
}
writer.writerow(row)
# ---------------------------------------------------------------------------
# Re-run pipeline (mode --rerun)
# ---------------------------------------------------------------------------
def check_ollama() -> bool:
"""Vérifie que Ollama est joignable."""
try:
import urllib.request
url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
req = urllib.request.Request(f"{url}/api/tags", method="GET")
urllib.request.urlopen(req, timeout=5)
return True
except Exception:
return False
def run_pipeline_with_env(dossier_id: str, llm_flag: str) -> bool:
"""Lance le pipeline sur un dossier avec T2A_DP_RANKER_LLM=flag."""
env = os.environ.copy()
env["T2A_DP_RANKER_LLM"] = llm_flag
try:
result = subprocess.run(
[PY, "-m", "src.main", str(INPUT_DIR / dossier_id)],
capture_output=True, text=True, cwd=str(ROOT),
timeout=600, env=env,
)
return result.returncode == 0
except Exception as e:
print(f" ERREUR: {e}")
return False
# ---------------------------------------------------------------------------
# Rapport Markdown
# ---------------------------------------------------------------------------
def _pct(v: float) -> str:
return f"{v * 100:.1f}%"
def generate_report(
metrics_off: dict,
metrics_on: dict | None,
dossier_details: list[dict],
args: argparse.Namespace,
gold_metrics: dict | None = None,
gold_evals: list[dict] | None = None,
) -> str:
"""Génère le rapport Markdown."""
lines: list[str] = []
now = datetime.now().strftime("%Y-%m-%d %H:%M")
# Commit hash
try:
commit = subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"],
cwd=str(ROOT), text=True, stderr=subprocess.DEVNULL,
).strip()
except Exception:
commit = "?"
lines.append("# NUKE-3 — Benchmark Report")
lines.append("")
lines.append(f"**Date** : {now} ")
lines.append(f"**Commit** : `{commit}` ")
lines.append(f"**Dossiers analysés** : {metrics_off['n_total']} ")
lines.append(f"**Mode** : {'rerun pipeline' if args.rerun else 'analyse offline (JSON existants)'} ")
lines.append("")
# Table comparative
lines.append("## Métriques DIM-like")
lines.append("")
if metrics_on:
lines.append("| Métrique | LLM OFF | LLM ON | Delta |")
lines.append("|----------|---------|--------|-------|")
rows = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows:
v_off = metrics_off.get(key, 0)
v_on = metrics_on.get(key, 0)
delta = v_on - v_off
sign = "+" if delta > 0 else ""
lines.append(
f"| {label} | {_pct(v_off)} | {_pct(v_on)} | {sign}{_pct(delta)} |"
)
else:
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
rows_single = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows_single:
v = metrics_off.get(key, 0)
lines.append(f"| {label} | {_pct(v)} |")
lines.append("")
# Volumes
lines.append("## Volumes")
lines.append("")
lines.append(f"- Dossiers avec dp_selection : {metrics_off['n_with_selection']}/{metrics_off['n_total']}")
lines.append(f"- CONFIRMED : {metrics_off['confirmed_count']}")
lines.append(f"- REVIEW : {metrics_off['review_count']}")
c = metrics_off.get("confidence", {})
lines.append(f"- Confidence — high: {c.get('high', 0)}, medium: {c.get('medium', 0)}, low: {c.get('low', 0)}")
lines.append("")
# Détail par dossier
lines.append("## Détail par dossier")
lines.append("")
lines.append("| Dossier | Verdict | Code | Confidence | Evidence | Candidats | Reason |")
lines.append("|---------|---------|------|------------|----------|-----------|--------|")
for d in dossier_details:
sel = d.get("dp_selection", {})
if not sel:
lines.append(f"| {d['id']} | - | - | - | - | - | pas de dp_selection |")
continue
lines.append(
f"| {d['id']} "
f"| {sel.get('verdict', '-')} "
f"| {sel.get('chosen_code', '-')} "
f"| {sel.get('confidence', '-')} "
f"| {sel.get('n_evidence', 0)} "
f"| {sel.get('n_candidates', 0)} "
f"| {(sel.get('reason') or '-')[:60]} |"
)
# Section gold CRH
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
lines.append("")
lines.append("## Évaluation Gold CRH")
lines.append("")
lines.append(f"**Cas gold évalués** : {gm['n']} ")
lines.append("")
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
lines.append(f"| Exact match (strict) | {_pct(gm['exact_match_strict_rate'])} ({gm['exact_match_strict']}/{gm['n']}) |")
lines.append(f"| Exact match (codes tolérants) | {_pct(gm['exact_match_tolerant_rate'])} ({gm['exact_match_tolerant']}/{gm['n']}) |")
lines.append(f"| Family3 match (tolérant) | {_pct(gm['family3_match_rate'])} ({gm['family3_match']}/{gm['n']}) |")
lines.append(f"| Acceptable match (codes OU family3) | {_pct(gm['acceptable_match_rate'])} ({gm['acceptable_match']}/{gm['n']}) |")
if gm["confirmed_accuracy_tolerant"] is not None:
lines.append(f"| Confirmed accuracy (tolérant) | {_pct(gm['confirmed_accuracy_tolerant'])} ({gm['confirmed_count']} CONFIRMED) |")
lines.append(f"| Symptôme non autorisé | {gm['symptom_not_allowed']}/{gm['n']} |")
lines.append("")
# Détail par cas gold
if gold_evals:
lines.append("### Détail par cas gold")
lines.append("")
lines.append("| Case ID | Choisi | Attendu | Strict | Acceptable | Symptôme interdit | Verdict |")
lines.append("|---------|--------|---------|--------|------------|-------------------|---------|")
for ev in gold_evals:
ok_s = "OK" if ev["exact_match_strict"] else "FAIL"
ok_a = "OK" if ev["acceptable_match"] else "FAIL"
sym = "OUI" if ev["symptom_not_allowed"] else "-"
lines.append(
f"| {ev['case_id']} "
f"| {ev['chosen_code'] or '-'} "
f"| {ev['dp_expected_code']} "
f"| {ok_s} "
f"| {ok_a} "
f"| {sym} "
f"| {ev['verdict'] or '-'} |"
)
lines.append("")
lines.append("")
lines.append("---")
lines.append(f"*Généré par `scripts/benchmark_nuke3_compare.py` — {now}*")
# Règle DIM rappel
lines.append("")
lines.append("> **Règle DIM** : `CONFIRMED` ⇒ `evidence` obligatoirement non vide.")
lines.append("> Un DP sans preuve exploitable est automatiquement `REVIEW`.")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _rebuild_and_select(data: dict) -> dict:
"""Reconstruit un DossierMedical depuis le JSON et exécute select_dp() offline.
Utile quand les JSON n'ont pas de champ dp_selection (générés avant NUKE-3).
"""
from src.config import DossierMedical, Diagnostic, Sejour
from src.medical.dp_selector import select_dp
dp_raw = data.get("diagnostic_principal", {})
das_raw = data.get("diagnostics_associes", [])
doc_type = data.get("document_type", "crh")
sej_raw = data.get("sejour", {})
dp_diag = None
if dp_raw and dp_raw.get("texte"):
dp_diag = Diagnostic(
texte=dp_raw.get("texte", ""),
cim10_suggestion=dp_raw.get("cim10_suggestion") or dp_raw.get("cim10_final"),
cim10_confidence=dp_raw.get("cim10_confidence"),
source=dp_raw.get("source"),
)
das_list = []
for d_item in das_raw:
code = d_item.get("cim10_suggestion") or d_item.get("cim10_final")
if not code:
continue
das_list.append(Diagnostic(
texte=d_item.get("texte", ""),
cim10_suggestion=code,
cim10_confidence=d_item.get("cim10_confidence"),
source=d_item.get("source"),
status=d_item.get("status"),
))
safe_sej = {k: v for k, v in sej_raw.items() if k in Sejour.model_fields}
dossier = DossierMedical(
document_type=doc_type,
sejour=Sejour(**safe_sej),
diagnostic_principal=dp_diag,
diagnostics_associes=das_list,
)
# Construire synthese depuis les champs disponibles.
# Les JSONs pré-NUKE-3 n'ont pas de sections CRH stockées.
# On récupère le texte de conclusion depuis les source_excerpt si besoin.
conclusion = data.get("conclusion_medicale", "")
if not conclusion:
# Chercher "CONCLUSION" dans source_excerpt des DAS ou traitements.
# Prendre l'extrait le plus long (les courts sont souvent tronqués).
best = ""
for container in (das_raw, data.get("traitements_sortie", [])):
for item in container:
excerpt = item.get("source_excerpt", "")
up = excerpt.upper()
if "CONCLUSION" in up:
idx = up.index("CONCLUSION")
candidate = excerpt[idx:]
if len(candidate) > len(best):
best = candidate
conclusion = best
synthese = {
"motif": data.get("motif_hospitalisation", ""),
"conclusion": conclusion,
"diag_sortie": data.get("synthese_medicale", {}).get("diag_sortie", ""),
"diag_principal": data.get("synthese_medicale", {}).get("diag_principal", ""),
"synthese": data.get("synthese_medicale", {}).get("synthese", ""),
}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
dossier.dp_selection = selection
# Finalizer DP (arbitrage Trackare vs CRH, traçabilité)
try:
from src.medical.dp_finalizer import finalize_dp
finalize_dp(dossier)
except Exception:
pass
# Utiliser dp_final si disponible, sinon dp_selection
final = dossier.dp_final or selection
# Convertir en dict compatible analyze_dp_selection
cands = [c.model_dump() for c in final.candidates]
result = {
"dp_selection": {
"verdict": final.verdict,
"confidence": final.confidence,
"chosen_code": final.chosen_code,
"chosen_term": final.chosen_term,
"candidates": cands,
"evidence": final.evidence,
"reason": final.reason,
"debug_scores": final.debug_scores,
}
}
if dossier.dp_final:
result["dp_final"] = dossier.dp_final.model_dump(exclude_none=True)
if dossier.quality_flags:
result["quality_flags"] = dossier.quality_flags
return result
def _run_debug_reports(
args: argparse.Namespace,
dossier_ids: list[str],
dossier_details: list[dict],
gold_index: dict | None,
gold_evals: list[dict] | None,
out_dir: Path,
) -> None:
"""Exécute les modes --case-id, --top-errors, --dim-pack."""
from src.eval.gold_debug import (
build_case_report,
write_case_report,
build_error_entry,
sort_error_entries,
write_top_errors_csv,
write_top_errors_md,
write_top_errors_jsonl,
select_dim_pack_cases,
write_dim_pack,
)
from src.eval.gold_models import evaluate_dp
has_debug = args.case_id or args.top_errors > 0 or args.dim_pack > 0
if not has_debug:
return
# Helper : build full report for a case
def _build_report_for(case_id: str) -> dict | None:
data = load_dossier_json(case_id)
if not data:
return None
# Offline rebuild si nécessaire
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
dp_sel = data.get("dp_selection")
gold_case_dict = None
eval_result = None
if gold_index and case_id in gold_index:
gc = gold_index[case_id]
gold_case_dict = gc.model_dump()
chosen_code = (dp_sel or {}).get("chosen_code")
eval_result = evaluate_dp(chosen_code, gc)
return build_case_report(case_id, data, dp_sel, gold_case_dict, eval_result)
# --case-id
if args.case_id:
cid = args.case_id.strip()
data = load_dossier_json(cid)
if not data:
print(f"ERREUR: output JSON introuvable pour {cid}")
print(f" Suggestion : relancer le pipeline avec --rerun ou vérifier output/structured/{cid}/")
sys.exit(1)
if gold_index and cid not in gold_index:
print(f"ERREUR: {cid} absent du gold ({len(gold_index)} cas chargés)")
sys.exit(1)
report = _build_report_for(cid)
if report:
jp, mp = write_case_report(report, out_dir)
print(f"\n=== Case debug: {cid} ===")
print(f" JSON : {jp}")
print(f" MD : {mp}")
# --top-errors
if args.top_errors > 0:
if not gold_index:
print("ERREUR: --top-errors requiert --gold (ou auto-détection gold_crh.jsonl)")
sys.exit(1)
# Build reports for all gold cases
all_reports: list[dict] = []
gold_case_ids = set(gold_index.keys())
for cid in dossier_ids:
if cid not in gold_case_ids:
continue
r = _build_report_for(cid)
if r:
all_reports.append(r)
entries = [build_error_entry(r) for r in all_reports]
entries = sort_error_entries(entries)
entries = entries[:args.top_errors]
csv_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.csv"
md_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.md"
jsonl_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.jsonl"
write_top_errors_csv(entries, csv_p)
write_top_errors_md(entries, md_p)
write_top_errors_jsonl(entries, jsonl_p)
print(f"\n=== Top {len(entries)} erreurs gold ===")
print(f" CSV : {csv_p}")
print(f" MD : {md_p}")
print(f" JSONL : {jsonl_p}")
# --dim-pack
if args.dim_pack > 0:
# Build reports for all CRH (non-trackare) dossiers
all_reports_dim: list[dict] = []
for cid in dossier_ids:
r = _build_report_for(cid)
if r and r["document_type"] != "trackare":
all_reports_dim.append(r)
elif r and r["prediction"]["verdict"] == "REVIEW":
# Include trackare-sans-DP too (they go through scoring)
all_reports_dim.append(r)
selected = select_dim_pack_cases(all_reports_dim, args.dim_pack)
csv_p, cases_dir = write_dim_pack(selected, out_dir)
print(f"\n=== DIM Pack ({len(selected)} cas) ===")
print(f" CSV : {csv_p}")
print(f" Cas JSON : {cases_dir}/")
def main():
parser = argparse.ArgumentParser(description="Benchmark NUKE-3 comparatif")
parser.add_argument("--n", type=int, default=0, help="Nombre de dossiers (0=tous)")
parser.add_argument("--dossiers", type=str, default="", help="IDs séparés par virgules")
parser.add_argument("--rerun", action="store_true", help="Relancer le pipeline (nécessite Ollama pour LLM on)")
parser.add_argument("--offline", action="store_true",
help="Exécuter NUKE-3 offline (reconstruit DossierMedical depuis JSON, LLM off)")
parser.add_argument("--gold", type=str, default="",
help="Fichier JSONL gold CRH (évaluation tolérante)")
parser.add_argument("--case-id", type=str, default="",
help="Rapport détaillé pour un cas (ex: 74_23141536)")
parser.add_argument("--top-errors", type=int, default=0,
help="Top N erreurs gold (ex: 20)")
parser.add_argument("--dim-pack", type=int, default=0,
help="Pack DIM de N cas CRH à annoter (ex: 20)")
parser.add_argument("--out-dir", type=str, default=str(ROOT / "docs" / "gold_debug"),
help="Dossier de sortie pour debug reports")
parser.add_argument("--output", type=str, default=str(REPORT_PATH), help="Chemin du rapport")
args = parser.parse_args()
specific = [d.strip() for d in args.dossiers.split(",") if d.strip()] if args.dossiers else None
dossier_ids = select_dossiers(args.n, specific)
if not dossier_ids:
print("ERREUR: aucun dossier trouvé")
sys.exit(1)
print(f"NUKE-3 benchmark — {len(dossier_ids)} dossiers")
# Mode rerun
if args.rerun:
ollama_ok = check_ollama()
print(f" Ollama: {'OK' if ollama_ok else 'INDISPONIBLE'}")
# Pass 1 : LLM OFF
print("\n=== Pass 1 : T2A_DP_RANKER_LLM=0 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "0")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
# Analyse JSON existants (ou résultat du pass 1)
print("\n=== Analyse des dossiers ===")
analyses_off: list[dict] = []
dossier_details: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if not data:
print(f" {did}: JSON introuvable")
dossier_details.append({"id": did, "dp_selection": None})
continue
# Mode offline : reconstruire le DossierMedical et exécuter select_dp
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
analysis = analyze_dp_selection(data)
analyses_off.append(analysis)
dossier_details.append({"id": did, "dp_selection": analysis})
verdict = analysis["verdict"] or "-"
code = analysis["chosen_code"] or "-"
print(f" {did}: {verdict}{code} (evidence: {analysis['n_evidence']})")
metrics_off = compute_metrics(analyses_off)
# Pass 2 : LLM ON (si rerun + Ollama dispo)
metrics_on = None
if args.rerun:
if not check_ollama():
print("\nWARN: Ollama indisponible — pass LLM ON ignorée")
print(" Le rapport ne contiendra que les métriques LLM OFF")
else:
print("\n=== Pass 2 : T2A_DP_RANKER_LLM=1 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "1")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
analyses_on: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if data:
analyses_on.append(analyze_dp_selection(data))
metrics_on = compute_metrics(analyses_on)
# Gold CRH
gold_metrics = None
gold_evals = None
gold_index = None
gold_path = args.gold
if not gold_path:
# Auto-détection
default_gold = ROOT / "data" / "gold_crh" / "gold_crh.jsonl"
if default_gold.exists():
gold_path = str(default_gold)
if gold_path:
try:
gold_index = load_gold(gold_path)
print(f"\n=== Évaluation Gold CRH ({len(gold_index)} cas) ===")
gold_evals = evaluate_gold_cases(dossier_details, gold_index)
gold_metrics = compute_gold_metrics(gold_evals)
for ev in gold_evals:
match_str = "OK" if ev["acceptable_match"] else "FAIL"
sym_str = " [R* interdit]" if ev["symptom_not_allowed"] else ""
print(f" {ev['case_id']}: {ev['chosen_code'] or '-'} vs {ev['dp_expected_code']}"
f"{match_str}{sym_str}")
# CSV évaluation
csv_out = ROOT / "docs" / "NUKE3_GOLD_EVAL.csv"
write_gold_eval_csv(gold_evals, csv_out)
print(f"\nCSV évaluation : {csv_out}")
except Exception as e:
print(f"\nERREUR gold : {e}")
gold_metrics = None
gold_evals = None
# --- Debug reports (--case-id, --top-errors, --dim-pack) ---
out_dir = Path(args.out_dir)
_run_debug_reports(args, dossier_ids, dossier_details, gold_index, gold_evals, out_dir)
# Rapport
report = generate_report(
metrics_off, metrics_on, dossier_details, args,
gold_metrics=gold_metrics, gold_evals=gold_evals,
)
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(report, encoding="utf-8")
print(f"\nRapport écrit : {output_path}")
# Résumé console
print(f"\n{'='*50}")
print(f"CONFIRMED : {metrics_off['confirmed_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['confirmed_rate'])})")
print(f"REVIEW : {metrics_off['review_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['review_rate'])})")
print(f"Evidence : {_pct(metrics_off['confirmed_evidence_rate'])} des CONFIRMED")
print(f"DP symptôme : {_pct(metrics_off['dp_symptom_rate'])}")
print(f"DP comorbidité: {_pct(metrics_off['dp_comorbidity_rate'])}")
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
print(f"\n--- Gold CRH ({gm['n']} cas) ---")
print(f"Strict match : {_pct(gm['exact_match_strict_rate'])}")
print(f"Acceptable match : {_pct(gm['acceptable_match_rate'])}")
if gm['confirmed_accuracy_tolerant'] is not None:
print(f"Confirmed acc. : {_pct(gm['confirmed_accuracy_tolerant'])}")
print(f"Symptôme interdit: {gm['symptom_not_allowed']}")
print(f"{'='*50}")
if __name__ == "__main__":
main()

scripts/benchmark_quality.py (deleted)

@@ -1,722 +0,0 @@
#!/usr/bin/env python3
"""Benchmark qualité T2A — validation end-to-end sur vrais dossiers.
Compare la qualité des codes CIM-10, vetos, downgrades et CPAM
entre runs successifs. Chaque run est sauvegardé dans un répertoire
isolé pour permettre des comparaisons A/B.
Usage:
python scripts/benchmark_quality.py --n 10
python scripts/benchmark_quality.py --n 10 --compare RUN_ID
python scripts/benchmark_quality.py --dossiers 116_23065570,45_23183041
python scripts/benchmark_quality.py --gold-standard
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean, median
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
BENCHMARKS_DIR = ROOT / "output" / "benchmarks"
GOLD_STANDARD_FILE = ROOT / "data" / "gold_standard" / "_selection.json"
INPUT_DIR = ROOT / "input"
OUTPUT_DIR = ROOT / "output" / "structured"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Sélection des dossiers
# ---------------------------------------------------------------------------
def _gold_standard_ids() -> list[str]:
"""Charge les IDs du gold standard."""
if not GOLD_STANDARD_FILE.exists():
print(f"ERREUR: {GOLD_STANDARD_FILE} introuvable")
sys.exit(1)
data = json.loads(GOLD_STANDARD_FILE.read_text("utf-8"))
# Format: "116_23065570/116_23065570_fusionne_cim10" → on prend la partie avant /
return [d.split("/")[0] for d in data["dossiers"]]
def select_dossiers(n: int, gold_standard: bool, specific: list[str] | None, seed: int = 42) -> list[str]:
"""Sélectionne les dossiers à benchmarker."""
if specific:
# Vérifier que les dossiers existent
valid = []
for d in specific:
if (INPUT_DIR / d).is_dir():
valid.append(d)
else:
print(f" WARN: dossier {d} introuvable dans input/")
return valid
if gold_standard:
ids = _gold_standard_ids()
return ids[:n] if n < len(ids) else ids
# Sinon : prendre N dossiers depuis input/ (tri déterministe + seed pour reproductibilité)
all_dirs = sorted(
d.name for d in INPUT_DIR.iterdir()
if d.is_dir() and any(d.glob("*.pdf"))
)
if not all_dirs:
print("ERREUR: aucun dossier avec PDF dans input/")
sys.exit(1)
import random
rng = random.Random(seed)
rng.shuffle(all_dirs)
return all_dirs[:n]
# ---------------------------------------------------------------------------
# Exécution pipeline
# ---------------------------------------------------------------------------
def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
"""Exécute le pipeline sur un dossier. Retourne (durée_s, succès)."""
input_path = INPUT_DIR / dossier_id
if clean:
for subdir in ["structured", "reports", "anonymized"]:
target = ROOT / "output" / subdir / dossier_id
if target.exists():
shutil.rmtree(target)
t0 = time.time()
try:
result = subprocess.run(
[PY, "-m", "src.main", str(input_path)],
capture_output=True,
text=True,
cwd=str(ROOT),
timeout=600, # 10 min max par dossier
)
duration = time.time() - t0
if result.returncode != 0:
print(f" STDERR: {result.stderr[-500:]}")
return duration, False
return duration, True
except subprocess.TimeoutExpired:
return time.time() - t0, False
except Exception as e:
print(f" EXCEPTION: {e}")
return time.time() - t0, False
# ---------------------------------------------------------------------------
# Chargement dictionnaire CIM-10
# ---------------------------------------------------------------------------
def load_cim10_dict() -> dict[str, str]:
"""Charge le dictionnaire CIM-10 (sans passer par le singleton)."""
dict_path = ROOT / "data" / "cim10_dict.json"
supp_path = ROOT / "data" / "cim10_supplements.json"
d = {}
if dict_path.exists():
d = json.loads(dict_path.read_text("utf-8"))
if supp_path.exists():
for code, label in json.loads(supp_path.read_text("utf-8")).items():
d.setdefault(code, label)
return d
def normalize_code(code: str) -> str:
"""K810 → K81.0, k85.1 → K85.1."""
code = code.strip().upper()
if len(code) > 3 and "." not in code:
code = code[:3] + "." + code[3:]
return code
def is_valid_code(code: str, cim10: dict[str, str]) -> bool:
"""Vérifie si un code CIM-10 existe dans le dictionnaire."""
nc = normalize_code(code)
return nc in cim10 or code.upper().strip() in cim10
# ---------------------------------------------------------------------------
# Analyse d'un dossier
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
# Chercher le fusionné d'abord
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
# Sinon premier _cim10.json
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def analyze_dossier(dossier_id: str, cim10: dict[str, str], duration: float) -> dict:
"""Analyse le JSON de sortie d'un dossier et extrait les métriques."""
result = {
"dossier_id": dossier_id,
"processing_time_s": round(duration, 1),
"success": False,
}
json_path = find_merged_json(dossier_id)
if not json_path:
return result
try:
data = json.loads(json_path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return result
result["success"] = True
# --- DP ---
dp = data.get("diagnostic_principal", {})
dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
dp_suggestion = dp.get("cim10_suggestion") or ""
result["dp"] = {
"texte": (dp.get("texte") or "")[:80],
"code_suggestion": dp_suggestion,
"code_final": dp_code,
"confidence": dp.get("cim10_confidence", ""),
"has_code": bool(dp_code),
"valid_code": is_valid_code(dp_code, cim10) if dp_code else False,
"downgraded": bool(dp_code and dp_suggestion and dp_code != dp_suggestion),
}
# --- DAS ---
das_list = data.get("diagnostics_associes", [])
das_codes = []
das_conf = {"high": 0, "medium": 0, "low": 0}
das_valid = 0
das_no_code = 0
das_downgraded = 0
for d_item in das_list:
code = d_item.get("cim10_final") or d_item.get("cim10_suggestion") or ""
suggestion = d_item.get("cim10_suggestion") or ""
conf = d_item.get("cim10_confidence", "low")
if not code:
das_no_code += 1
continue
das_codes.append(code)
if conf in das_conf:
das_conf[conf] += 1
if is_valid_code(code, cim10):
das_valid += 1
if code and suggestion and code != suggestion:
das_downgraded += 1
n_das_with_code = len(das_codes)
result["das"] = {
"total": len(das_list),
"with_code": n_das_with_code,
"no_code": das_no_code,
"valid": das_valid,
"validity_rate": round(das_valid / n_das_with_code, 3) if n_das_with_code else 0,
"confidence": das_conf,
"downgraded": das_downgraded,
"downgrade_rate": round(das_downgraded / n_das_with_code, 3) if n_das_with_code else 0,
"codes_uniques": sorted(set(das_codes)),
}
# --- Metrics du dossier ---
metrics = data.get("metrics", {})
result["metrics"] = {
"das_active": metrics.get("das_active", 0),
"das_removed": metrics.get("das_removed", 0),
"das_ruled_out": metrics.get("das_ruled_out", 0),
}
# --- Veto ---
veto = data.get("veto_report", {})
issues = veto.get("issues", [])
result["veto"] = {
"verdict": veto.get("verdict", "NO_REPORT"),
"score": veto.get("score_contestabilite", 0),
"issues_count": len(issues),
"hard_count": sum(1 for i in issues if i.get("severity") == "HARD"),
"top_issues": [i.get("veto", i.get("type", "?")) for i in issues[:5]],
}
# --- GHM ---
ghm = data.get("ghm_estimation")
result["ghm"] = {
"estimated": ghm is not None and bool(ghm),
"cmd": ghm.get("cmd") if ghm else None,
"severity": ghm.get("severity") if ghm else None,
"ghm": ghm.get("ghm") if ghm else None,
}
# --- CPAM ---
cpam = data.get("controles_cpam", [])
result["cpam"] = {
"controls_count": len(cpam),
"has_response": any(bool(c.get("contre_argumentation")) for c in cpam),
"sources_count": sum(len(c.get("sources_reponse", [])) for c in cpam),
}
# --- Biologie ---
bio = data.get("biologie_cle", [])
result["biologie"] = {
"tests_count": len(bio),
"anomalies": sum(1 for b in bio if b.get("anomalie")),
}
# --- Codes CIM-10 invalides (détail) ---
invalid_codes = []
if dp_code and not is_valid_code(dp_code, cim10):
invalid_codes.append(f"DP:{dp_code}")
for code in das_codes:
if not is_valid_code(code, cim10):
invalid_codes.append(f"DAS:{code}")
result["invalid_codes"] = invalid_codes
return result
# ---------------------------------------------------------------------------
# Agrégation
# ---------------------------------------------------------------------------
def compute_aggregate(per_dossier: list[dict]) -> dict:
"""Calcule les métriques agrégées sur tous les dossiers."""
successful = [d for d in per_dossier if d.get("success")]
n = len(successful)
if n == 0:
return {"n_total": len(per_dossier), "n_success": 0}
# DP
dp_has_code = sum(1 for d in successful if d["dp"]["has_code"])
dp_valid = sum(1 for d in successful if d["dp"]["valid_code"])
dp_conf = {"high": 0, "medium": 0, "low": 0}
for d in successful:
c = d["dp"]["confidence"]
if c in dp_conf:
dp_conf[c] += 1
dp_downgraded = sum(1 for d in successful if d["dp"]["downgraded"])
# DAS
total_das = sum(d["das"]["total"] for d in successful)
total_das_with_code = sum(d["das"]["with_code"] for d in successful)
total_das_valid = sum(d["das"]["valid"] for d in successful)
total_das_downgraded = sum(d["das"]["downgraded"] for d in successful)
das_conf_agg = {"high": 0, "medium": 0, "low": 0}
for d in successful:
for k in das_conf_agg:
das_conf_agg[k] += d["das"]["confidence"].get(k, 0)
# Veto
verdicts = {}
total_hard = 0
for d in successful:
v = d["veto"]["verdict"]
verdicts[v] = verdicts.get(v, 0) + 1
total_hard += d["veto"]["hard_count"]
# GHM
ghm_estimated = sum(1 for d in successful if d["ghm"]["estimated"])
# CPAM
cpam_total = sum(d["cpam"]["controls_count"] for d in successful)
cpam_with_response = sum(1 for d in successful if d["cpam"]["has_response"])
# Temps
times = [d["processing_time_s"] for d in successful]
times_sorted = sorted(times)
p90_idx = int(len(times_sorted) * 0.9)
# Codes invalides
all_invalid = []
for d in successful:
all_invalid.extend(d.get("invalid_codes", []))
return {
"n_total": len(per_dossier),
"n_success": n,
"n_failed": len(per_dossier) - n,
"dp": {
"has_code_rate": round(dp_has_code / n, 3),
"valid_code_rate": round(dp_valid / n, 3),
"confidence": dp_conf,
"downgraded": dp_downgraded,
},
"das": {
"total": total_das,
"mean_per_dossier": round(total_das / n, 1),
"with_code": total_das_with_code,
"valid": total_das_valid,
"validity_rate": round(total_das_valid / total_das_with_code, 3) if total_das_with_code else 0,
"confidence": das_conf_agg,
"confidence_high_rate": round(das_conf_agg["high"] / total_das_with_code, 3) if total_das_with_code else 0,
"downgraded": total_das_downgraded,
"downgrade_rate": round(total_das_downgraded / total_das_with_code, 3) if total_das_with_code else 0,
},
"veto": {
"verdicts": verdicts,
"hard_total": total_hard,
"dossiers_with_hard": sum(1 for d in successful if d["veto"]["hard_count"] > 0),
},
"ghm": {
"estimated_rate": round(ghm_estimated / n, 3),
},
"cpam": {
"controls_total": cpam_total,
"with_response": cpam_with_response,
},
"timing": {
"mean_s": round(mean(times), 1),
"median_s": round(median(times), 1),
"p90_s": round(times_sorted[p90_idx], 1) if times_sorted else 0,
"total_s": round(sum(times), 1),
},
"invalid_codes": all_invalid,
"invalid_codes_count": len(all_invalid),
}
# ---------------------------------------------------------------------------
# Rapport texte
# ---------------------------------------------------------------------------
def _pct(val: float) -> str:
return f"{val * 100:.1f}%"
def _bar(val: float, width: int = 20) -> str:
filled = int(val * width)
return "" * filled + "" * (width - filled)
def generate_report(run_id: str, config: dict, agg: dict, per_dossier: list[dict]) -> str:
"""Génère un rapport lisible."""
lines = []
w = 66
lines.append("=" * w)
lines.append(f" BENCHMARK QUALITÉ T2A — {run_id}")
lines.append("=" * w)
lines.append(f" Date : {config['timestamp']}")
lines.append(f" Modèles : coding={config['models'].get('coding','?')} cpam={config['models'].get('cpam','?')}")
lines.append(f" validation={config['models'].get('validation','?')} qc={config['models'].get('qc','?')}")
lines.append(f" Dossiers : {agg['n_success']}/{agg['n_total']} traités ({agg.get('n_failed',0)} échecs)")
lines.append(f" Durée : {agg['timing']['total_s']:.0f}s ({agg['timing']['mean_s']:.1f}s/dossier)")
lines.append("-" * w)
# DP
dp = agg["dp"]
lines.append("")
lines.append(" DIAGNOSTIC PRINCIPAL (DP)")
lines.append(f" Code obtenu : {_bar(dp['has_code_rate'])} {_pct(dp['has_code_rate'])}")
lines.append(f" Code CIM-10 valide : {_bar(dp['valid_code_rate'])} {_pct(dp['valid_code_rate'])}")
lines.append(f" Confiance high : {dp['confidence'].get('high',0)}/{agg['n_success']} "
f"medium: {dp['confidence'].get('medium',0)} low: {dp['confidence'].get('low',0)}")
lines.append(f" Downgrades : {dp['downgraded']}")
# DAS
das = agg["das"]
lines.append("")
lines.append(" DIAGNOSTICS ASSOCIÉS (DAS)")
lines.append(f" Total : {das['total']} (moy {das['mean_per_dossier']}/dossier)")
lines.append(f" Avec code : {das['with_code']}/{das['total']}")
lines.append(f" Codes valides : {_bar(das['validity_rate'])} {_pct(das['validity_rate'])}")
lines.append(f" Confiance : high={das['confidence']['high']} "
f"medium={das['confidence']['medium']} low={das['confidence']['low']}")
lines.append(f" Confiance high : {_bar(das['confidence_high_rate'])} {_pct(das['confidence_high_rate'])}")
lines.append(f" Downgrades : {das['downgraded']} ({_pct(das['downgrade_rate'])})")
# Veto
veto = agg["veto"]
lines.append("")
lines.append(" VETOS / QUALITÉ")
for v, count in sorted(veto["verdicts"].items(), key=lambda x: -x[1]):
lines.append(f" {v:12s} : {count}")
lines.append(f" Issues HARD : {veto['hard_total']} (dans {veto['dossiers_with_hard']} dossiers)")
# GHM
lines.append("")
lines.append(" GHM")
lines.append(f" Estimé : {_bar(agg['ghm']['estimated_rate'])} {_pct(agg['ghm']['estimated_rate'])}")
# CPAM
if agg["cpam"]["controls_total"] > 0:
lines.append("")
lines.append(" CPAM")
lines.append(f" Contrôles : {agg['cpam']['controls_total']}")
lines.append(f" Avec réponse : {agg['cpam']['with_response']}")
# Temps
lines.append("")
lines.append(" TEMPS DE TRAITEMENT")
lines.append(f" Moyen : {agg['timing']['mean_s']:.1f}s")
lines.append(f" Médian : {agg['timing']['median_s']:.1f}s")
lines.append(f" P90 : {agg['timing']['p90_s']:.1f}s")
lines.append(f" Total : {agg['timing']['total_s']:.0f}s")
# Codes invalides
if agg["invalid_codes"]:
lines.append("")
lines.append(f" CODES CIM-10 INVALIDES ({agg['invalid_codes_count']})")
for code in agg["invalid_codes"][:20]:
lines.append(f" {code}")
if agg["invalid_codes_count"] > 20:
lines.append(f" ... et {agg['invalid_codes_count'] - 20} autres")
# Détail par dossier
lines.append("")
lines.append("-" * w)
lines.append(" DÉTAIL PAR DOSSIER")
lines.append("-" * w)
lines.append(f" {'Dossier':<25s} {'DP':>6s} {'DAS':>4s} {'Valid%':>7s} {'Veto':>10s} {'Temps':>6s}")
lines.append(f" {'-'*25:<25s} {'-'*6:>6s} {'-'*4:>4s} {'-'*7:>7s} {'-'*10:>10s} {'-'*6:>6s}")
for d in sorted(per_dossier, key=lambda x: x["dossier_id"]):
if not d.get("success"):
lines.append(f" {d['dossier_id']:<25s} {'ÉCHEC':>6s}")
continue
dp_code = d["dp"]["code_final"] or "-"
dp_mark = "" if d["dp"]["valid_code"] else ""
n_das = d["das"]["total"]
vr = f"{d['das']['validity_rate']*100:.0f}%" if d["das"]["with_code"] else "-"
verdict = d["veto"]["verdict"]
t = f"{d['processing_time_s']:.0f}s"
lines.append(f" {d['dossier_id']:<25s} {dp_code:>5s}{dp_mark} {n_das:>4d} {vr:>7s} {verdict:>10s} {t:>6s}")
lines.append("")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Comparaison entre runs
# ---------------------------------------------------------------------------
def compare_runs(current_agg: dict, baseline_agg: dict, baseline_id: str) -> str:
"""Compare deux runs et génère un rapport diff."""
lines = []
w = 66
lines.append("")
lines.append("=" * w)
lines.append(f" COMPARAISON avec {baseline_id}")
lines.append("=" * w)
def _delta(cur: float, base: float, is_pct: bool = True) -> str:
d = cur - base
sign = "+" if d >= 0 else ""
if is_pct:
return f"{sign}{d*100:.1f}%"
return f"{sign}{d:.1f}"
def _row(label: str, cur_val: float, base_val: float, is_pct: bool = True):
if is_pct:
cur_s = _pct(cur_val)
base_s = _pct(base_val)
else:
cur_s = f"{cur_val:.1f}"
base_s = f"{base_val:.1f}"
delta_s = _delta(cur_val, base_val, is_pct)
lines.append(f" {label:<24s} {base_s:>10s} {cur_s:>10s} {delta_s:>10s}")
lines.append(f" {'Métrique':<24s} {'Baseline':>10s} {'Actuel':>10s} {'Delta':>10s}")
lines.append(f" {'-'*24:<24s} {'-'*10:>10s} {'-'*10:>10s} {'-'*10:>10s}")
_row("DP code valide", current_agg["dp"]["valid_code_rate"], baseline_agg["dp"]["valid_code_rate"])
_row("DAS validité", current_agg["das"]["validity_rate"], baseline_agg["das"]["validity_rate"])
_row("DAS confiance high", current_agg["das"]["confidence_high_rate"], baseline_agg["das"]["confidence_high_rate"])
_row("DAS downgrade", current_agg["das"]["downgrade_rate"], baseline_agg["das"]["downgrade_rate"])
_row("GHM estimé", current_agg["ghm"]["estimated_rate"], baseline_agg["ghm"]["estimated_rate"])
_row("DAS moy/dossier", current_agg["das"]["mean_per_dossier"], baseline_agg["das"]["mean_per_dossier"], is_pct=False)
_row("Temps moyen (s)", current_agg["timing"]["mean_s"], baseline_agg["timing"]["mean_s"], is_pct=False)
# Codes invalides
cur_inv = set(current_agg.get("invalid_codes", []))
base_inv = set(baseline_agg.get("invalid_codes", []))
new_inv = cur_inv - base_inv
fixed_inv = base_inv - cur_inv
if new_inv:
lines.append(f"\n Nouveaux codes invalides : {', '.join(sorted(new_inv))}")
if fixed_inv:
lines.append(f" Codes corrigés : {', '.join(sorted(fixed_inv))}")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def get_current_config() -> dict:
"""Récupère la configuration modèle actuelle."""
try:
from src.config import OLLAMA_MODELS, OLLAMA_MODEL, OLLAMA_URL
return {
"models": dict(OLLAMA_MODELS),
"ollama_model": OLLAMA_MODEL,
"ollama_url": OLLAMA_URL,
}
except ImportError:
return {
"models": {
"coding": os.environ.get("T2A_MODEL_CODING", "?"),
"cpam": os.environ.get("T2A_MODEL_CPAM", "?"),
"validation": os.environ.get("T2A_MODEL_VALIDATION", "?"),
"qc": os.environ.get("T2A_MODEL_QC", "?"),
},
"ollama_model": os.environ.get("OLLAMA_MODEL", "?"),
}
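# Hypothetical invocation overriding per-stage models via the environment
# (the model names below are illustrative):
#   T2A_MODEL_CODING=pmsi-coder-v2 T2A_MODEL_QC=gemma3:12b python scripts/benchmark_quality.py --n 10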
def main():
parser = argparse.ArgumentParser(description="Benchmark qualité T2A")
parser.add_argument("--n", type=int, default=10, help="Nombre de dossiers")
parser.add_argument("--dossiers", type=str, help="IDs séparés par des virgules")
parser.add_argument("--gold-standard", action="store_true", help="Utiliser les 50 dossiers gold standard")
parser.add_argument("--compare", type=str, help="Run ID à comparer")
parser.add_argument("--label", type=str, default="", help="Label pour ce run")
parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
parser.add_argument("--workers", type=int, default=1, help="Nombre de dossiers traités en parallèle")
args = parser.parse_args()
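    # Illustrative invocations (run IDs below are examples, not real runs):
    #   python scripts/benchmark_quality.py --n 20 --label baseline
    #   python scripts/benchmark_quality.py --gold-standard --workers 4
    #   python scripts/benchmark_quality.py --no-reprocess --compare 20250101_120000_baseline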
    # Case selection
specific = args.dossiers.split(",") if args.dossiers else None
dossiers = select_dossiers(args.n, args.gold_standard, specific, args.seed)
print(f"\n Dossiers sélectionnés : {len(dossiers)}")
for d in dossiers:
print(f" - {d}")
# Config
config = get_current_config()
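    # Run IDs sort chronologically (YYYYMMDD_HHMMSS); the optional label keeps them readable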
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
if args.label:
run_id = f"{run_id}_{args.label}"
config["timestamp"] = datetime.now().isoformat()
config["run_id"] = run_id
config["dossiers"] = dossiers
config["args"] = {
"n": args.n,
"gold_standard": args.gold_standard,
"clean": args.clean,
"no_reprocess": args.no_reprocess,
"seed": args.seed,
"label": args.label,
}
print(f"\n Run ID : {run_id}")
print(f" Modèles : {config['models']}")
print(f" Reprocess: {'NON' if args.no_reprocess else 'OUI (clean=' + str(args.clean) + ')'}")
print()
    # Load the CIM-10 dictionary
cim10 = load_cim10_dict()
print(f" Dictionnaire CIM-10 : {len(cim10)} codes")
print()
    # Processing
per_dossier = []
total = len(dossiers)
if args.workers > 1 and not args.no_reprocess:
        # Parallel mode: run the pipelines concurrently, then analyze
from concurrent.futures import ThreadPoolExecutor, as_completed
print(f" Mode parallèle : {args.workers} workers")
pipeline_results: dict[str, tuple[float, bool]] = {}
done = 0
with ThreadPoolExecutor(max_workers=args.workers) as executor:
futures = {
executor.submit(run_pipeline, dossier_id, args.clean): dossier_id
for dossier_id in dossiers
}
for future in as_completed(futures):
dossier_id = futures[future]
try:
duration, success = future.result()
except Exception as e:
print(f" EXCEPTION {dossier_id}: {e}")
duration, success = 0.0, False
pipeline_results[dossier_id] = (duration, success)
done += 1
mark = "" if success else ""
print(f" [{done}/{total}] {dossier_id}{duration:.1f}s {mark}")
        # Sequential analysis (stable order)
for dossier_id in dossiers:
duration, success = pipeline_results[dossier_id]
metrics = analyze_dossier(dossier_id, cim10, duration)
per_dossier.append(metrics)
else:
        # Sequential mode (or --no-reprocess)
for i, dossier_id in enumerate(dossiers, 1):
print(f" [{i}/{total}] {dossier_id}", end="", flush=True)
if args.no_reprocess:
duration = 0.0
success = find_merged_json(dossier_id) is not None
if not success:
print(" — pas de JSON")
else:
print(" — analyse existant")
else:
print(" — traitement...", end="", flush=True)
duration, success = run_pipeline(dossier_id, args.clean)
print(f" {duration:.1f}s {'' if success else ''}")
metrics = analyze_dossier(dossier_id, cim10, duration)
per_dossier.append(metrics)
    # Aggregation
    agg = compute_aggregate(per_dossier)
    # Report
    report = generate_report(run_id, config, agg, per_dossier)
    print(report)
    # Comparison, if requested
    comparison = ""
if args.compare:
baseline_path = BENCHMARKS_DIR / args.compare / "metrics.json"
if baseline_path.exists():
baseline = json.loads(baseline_path.read_text("utf-8"))
comparison = compare_runs(agg, baseline["aggregate"], args.compare)
print(comparison)
else:
print(f"\n WARN: run baseline {args.compare} introuvable ({baseline_path})")
    # Save results
run_dir = BENCHMARKS_DIR / run_id
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "config.json").write_text(
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
)
(run_dir / "metrics.json").write_text(
json.dumps({"aggregate": agg, "per_dossier": per_dossier}, ensure_ascii=False, indent=2),
encoding="utf-8",
)
(run_dir / "report.txt").write_text(report + comparison, encoding="utf-8")
print(f"\n Résultats sauvegardés dans : {run_dir}")
print(f" Pour comparer un futur run : python scripts/benchmark_quality.py --compare {run_id}")
if __name__ == "__main__":
main()


@@ -1,163 +0,0 @@
#!/usr/bin/env python3
"""Régénération ciblée des contrôles CPAM classés Tier C ou sans response_data.
Usage :
cd /home/dom/ai/t2a_v2
.venv/bin/python3 scripts/regenerate_tier_c.py [--dry-run]
Le script :
1. Scanne output/structured/ pour trouver les contrôles Tier C + ceux sans response_data
2. Pour chaque contrôle, relance generate_cpam_response() avec le pipeline corrigé
3. Sauvegarde le JSON mis à jour (backup automatique .bak)
Options :
--dry-run Affiche les contrôles ciblés sans régénérer
"""
from __future__ import annotations
import json
import logging
import shutil
import sys
import time
from pathlib import Path
# Add the project root to sys.path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from src.config import DossierMedical
from src.control.cpam_response import generate_cpam_response
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
STRUCTURED_DIR = ROOT / "output" / "structured"
def find_targets() -> list[tuple[Path, int]]:
"""Trouve les fichiers JSON contenant des contrôles Tier C ou sans response_data.
Returns:
Liste de (chemin_json, index_du_controle_dans_la_liste).
"""
targets: list[tuple[Path, int]] = []
for sub in sorted(STRUCTURED_DIR.iterdir()):
if not sub.is_dir():
continue
for jf in sub.glob("*_fusionne_cim10.json"):
data = json.loads(jf.read_text(encoding="utf-8"))
controles = data.get("controles_cpam", [])
for i, ctrl in enumerate(controles):
tier = ctrl.get("quality_tier")
has_resp = ctrl.get("response_data") is not None
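                # Target low-quality answers (Tier C) and controls never answered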
if tier == "C" or not has_resp:
targets.append((jf, i))
return targets
def regenerate(targets: list[tuple[Path, int]]) -> dict[str, int]:
"""Régénère les contrôles CPAM ciblés.
Returns:
Statistiques {tier_A, tier_B, tier_C, errors}.
"""
stats = {"A": 0, "B": 0, "C": 0, "errors": 0}
    # Group by file so each case is loaded/saved only once
by_file: dict[Path, list[int]] = {}
for path, idx in targets:
by_file.setdefault(path, []).append(idx)
total = len(targets)
done = 0
for json_path, indices in by_file.items():
dossier_id = json_path.parent.name
logger.info("=== Dossier %s (%d contrôle(s) à régénérer) ===", dossier_id, len(indices))
        # Load the case
data = json.loads(json_path.read_text(encoding="utf-8"))
dossier = DossierMedical.model_validate(data)
modified = False
for idx in indices:
ctrl = dossier.controles_cpam[idx]
done += 1
old_tier = ctrl.quality_tier or "?"
logger.info("[%d/%d] OGC %d%s (ancien tier: %s)",
done, total, ctrl.numero_ogc, ctrl.titre[:60], old_tier)
t0 = time.time()
try:
text, response_data, sources = generate_cpam_response(dossier, ctrl)
elapsed = time.time() - t0
ctrl.contre_argumentation = text
ctrl.response_data = response_data
ctrl.sources_reponse = sources
new_tier = ctrl.quality_tier or "?"
stats[new_tier] = stats.get(new_tier, 0) + 1
modified = True
logger.info(" Résultat : tier %s%s (%d chars, %.1fs)",
old_tier, new_tier, len(text), elapsed)
except Exception:
logger.exception(" ERREUR sur OGC %d", ctrl.numero_ogc)
stats["errors"] += 1
if modified:
            # Back up, then save
backup_path = json_path.with_suffix(".json.bak")
shutil.copy2(json_path, backup_path)
json_path.write_text(
dossier.model_dump_json(indent=2, exclude_none=True),
encoding="utf-8",
)
logger.info(" Sauvegardé : %s (backup: %s)", json_path.name, backup_path.name)
return stats
def main() -> None:
dry_run = "--dry-run" in sys.argv
logger.info("Recherche des contrôles Tier C et sans response_data...")
targets = find_targets()
if not targets:
logger.info("Aucun contrôle à régénérer.")
return
logger.info("Trouvé %d contrôle(s) à régénérer :", len(targets))
for path, idx in targets:
data = json.loads(path.read_text(encoding="utf-8"))
ctrl = data["controles_cpam"][idx]
tier = ctrl.get("quality_tier", "?")
has_resp = "oui" if ctrl.get("response_data") else "NON"
logger.info(" %s OGC %d — tier %s, response_data: %s",
path.parent.name, ctrl["numero_ogc"], tier, has_resp)
if dry_run:
logger.info("Mode dry-run — aucune régénération effectuée.")
return
t0 = time.time()
stats = regenerate(targets)
elapsed = time.time() - t0
logger.info("=== TERMINÉ en %.1f min ===", elapsed / 60)
logger.info("Distribution : A=%d, B=%d, C=%d, erreurs=%d",
stats.get("A", 0), stats.get("B", 0), stats.get("C", 0), stats["errors"])
if __name__ == "__main__":
main()


@@ -1,231 +0,0 @@
#!/usr/bin/env python3
"""Sélectionne 50 dossiers pour le gold standard de validation DIM.
- 25 dossiers CPAM (cas complexes, déjà contrôlés)
- 25 dossiers non-CPAM stratifiés par CMD, confiance DP, nombre de DAS
Crée data/gold_standard/_selection.json et initialise les annotations vides.
"""
from __future__ import annotations
import json
import random
import sys
from datetime import datetime
from pathlib import Path
# Add the project root to sys.path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.config import STRUCTURED_DIR, BASE_DIR, DossierMedical
GOLD_DIR = BASE_DIR / "data" / "gold_standard"
TARGET_TOTAL = 50
TARGET_CPAM = 25
def load_all_dossiers() -> list[dict]:
"""Charge tous les dossiers fusionnés depuis output/structured/."""
dossiers = []
for subdir in sorted(STRUCTURED_DIR.iterdir()):
if not subdir.is_dir():
continue
        # Look for the merged file
fusionne = None
for f in subdir.glob("*fusionne*.json"):
fusionne = f
break
if not fusionne:
            # Fall back to the first JSON in the directory
jsons = sorted(subdir.glob("*.json"))
if jsons:
fusionne = jsons[0]
if not fusionne:
continue
try:
data = json.loads(fusionne.read_text(encoding="utf-8"))
dossier = DossierMedical.model_validate(data)
rel_path = str(fusionne.relative_to(STRUCTURED_DIR))
group_name = subdir.name
dossiers.append({
"dossier_id": f"{group_name}/{fusionne.stem}",
"group_name": group_name,
"path_rel": rel_path,
"dossier": dossier,
})
except Exception as e:
print(f" Erreur chargement {fusionne.name}: {e}")
return dossiers
def select_dossiers(all_dossiers: list[dict]) -> list[dict]:
"""Sélectionne les 50 dossiers selon la stratégie définie."""
# Séparer CPAM / non-CPAM
cpam = [d for d in all_dossiers if d["dossier"].controles_cpam]
non_cpam = [d for d in all_dossiers if not d["dossier"].controles_cpam]
print(f"Dossiers CPAM disponibles : {len(cpam)}")
print(f"Dossiers non-CPAM disponibles : {len(non_cpam)}")
    # Take all CPAM cases (capped at TARGET_CPAM)
selected_cpam = cpam[:TARGET_CPAM]
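    # Directory iteration is sorted upstream, so this CPAM slice is deterministic across runs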
remaining_target = TARGET_TOTAL - len(selected_cpam)
    # Stratify the non-CPAM cases
selected_non_cpam = stratified_sample(non_cpam, remaining_target)
selected = selected_cpam + selected_non_cpam
print(f"\nSélection finale : {len(selected)} dossiers")
print(f" - CPAM : {len(selected_cpam)}")
print(f" - Non-CPAM : {len(selected_non_cpam)}")
return selected
def stratified_sample(dossiers: list[dict], n: int) -> list[dict]:
"""Échantillonnage stratifié par CMD, confiance DP et nombre de DAS."""
if len(dossiers) <= n:
return dossiers
    # Group by CMD
by_cmd: dict[str, list[dict]] = {}
for d in dossiers:
ghm = d["dossier"].ghm_estimation
cmd = ghm.cmd if ghm else "inconnu"
by_cmd.setdefault(cmd or "inconnu", []).append(d)
selected = []
seen_ids = set()
    # Phase 1: one case per CMD (maximum diversity)
    cmds = sorted(by_cmd.keys())
    random.seed(42)  # Reproducible
for cmd in cmds:
if len(selected) >= n:
break
candidates = by_cmd[cmd]
        # Prefer a mix of confidence levels
random.shuffle(candidates)
d = candidates[0]
selected.append(d)
seen_ids.add(d["dossier_id"])
    # Phase 2: fill up, diversifying on DP confidence
if len(selected) < n:
remaining = [d for d in dossiers if d["dossier_id"] not in seen_ids]
        # Sort by DP confidence (low > medium > high, to over-represent hard cases)
conf_order = {"low": 0, "medium": 1, "high": 2, None: 3}
remaining.sort(key=lambda d: (
conf_order.get(
d["dossier"].diagnostic_principal.cim10_confidence
if d["dossier"].diagnostic_principal else None,
3
),
-len(d["dossier"].diagnostics_associes), # beaucoup de DAS d'abord
))
for d in remaining:
if len(selected) >= n:
break
selected.append(d)
return selected[:n]
def create_empty_annotation(dossier_id: str, dossier: DossierMedical) -> dict:
"""Crée une annotation vide pour un dossier."""
dp = dossier.diagnostic_principal
das_list = []
for i, das in enumerate(dossier.diagnostics_associes):
das_list.append({
"index": i,
"texte_original": das.texte,
"code_pipeline": das.cim10_suggestion or "",
"confidence": das.cim10_confidence or "",
"source": das.source or "",
"statut": "correct",
"code_corrige": None,
"commentaire": "",
})
return {
"dossier_id": dossier_id,
"validateur": "",
"date_validation": "",
"statut": "non_commence",
"dp": {
"texte_original": dp.texte if dp else "",
"code_pipeline": dp.cim10_suggestion if dp else "",
"confidence": dp.cim10_confidence if dp else "",
"statut": "correct",
"code_corrige": None,
"commentaire": "",
},
"das": das_list,
"das_ajoutes": [],
"commentaire_general": "",
}
def main():
print("=== Sélection des dossiers pour validation DIM ===\n")
all_dossiers = load_all_dossiers()
print(f"Total dossiers chargés : {len(all_dossiers)}\n")
if not all_dossiers:
print("Aucun dossier trouvé dans output/structured/")
sys.exit(1)
selected = select_dossiers(all_dossiers)
    # Create the gold standard directory
GOLD_DIR.mkdir(parents=True, exist_ok=True)
    # Save the selection
    selection = {
        "date_selection": datetime.now().isoformat(timespec="seconds"),
"total": len(selected),
"cpam": sum(1 for d in selected if d["dossier"].controles_cpam),
"non_cpam": sum(1 for d in selected if not d["dossier"].controles_cpam),
"dossiers": [d["dossier_id"] for d in selected],
}
selection_path = GOLD_DIR / "_selection.json"
selection_path.write_text(
json.dumps(selection, ensure_ascii=False, indent=2),
encoding="utf-8",
)
print(f"\nSélection sauvegardée : {selection_path}")
    # Initialize the empty annotations
created = 0
for d in selected:
dossier_id = d["dossier_id"]
safe_name = dossier_id.replace("/", "__") + ".json"
annot_path = GOLD_DIR / safe_name
if not annot_path.exists():
annotation = create_empty_annotation(dossier_id, d["dossier"])
annot_path.write_text(
json.dumps(annotation, ensure_ascii=False, indent=2),
encoding="utf-8",
)
created += 1
print(f"Annotations vides créées : {created}")
print(f"Annotations existantes préservées : {len(selected) - created}")
    # Summary
    print("\n--- Résumé ---")
for i, d in enumerate(selected, 1):
dos = d["dossier"]
dp_code = dos.diagnostic_principal.cim10_suggestion if dos.diagnostic_principal else "?"
dp_conf = (dos.diagnostic_principal.cim10_confidence or "?") if dos.diagnostic_principal else "?"
n_das = len(dos.diagnostics_associes)
cpam_flag = " [CPAM]" if dos.controles_cpam else ""
ghm = dos.ghm_estimation
cmd = ghm.cmd if ghm else "?"
print(f" {i:2d}. {d['group_name']:<20s} DP={dp_code:<6s} conf={dp_conf:<7s} DAS={n_das:2d} CMD={cmd}{cpam_flag}")
if __name__ == "__main__":
main()