feat: multi-model LLM architecture + quality engine + benchmark

- Multi-model: 4 LLM roles (coding=gemma3:27b-cloud, cpam=gemma3:27b-cloud,
  validation=deepseek-v3.2:cloud, qc=gemma3:12b) resolved via get_model(role)
- Externalized prompts: 7 templates in src/prompts/templates.py
- Ollama cache: model stored per entry (automatic migration of the old format)
- call_ollama(): new role= parameter (priority: model > role > global); see the
  sketch after this list
- Quality engine: veto_engine + decision_engine + rules_router (YAML)
- Quality benchmark: scripts/benchmark_quality.py (A/B runs, CIM-10 metrics)
- Biology fix: qualitative values (e.g. negative troponin) are no longer filtered out
- CPAM fix: gemma3:27b-cloud instead of deepseek (JSON truncated by thinking output)
- CPAM max_tokens 4000→6000; multi-model admin viewer
- 10-dossier benchmark: 100% valid DAS, 10/10 CPAM, 243 s/dossier
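
The role-to-model resolution described above works roughly as in the following
minimal sketch. It assumes the OLLAMA_MODELS / OLLAMA_MODEL / OLLAMA_URL names
from src/config and the T2A_MODEL_* environment variables; the function bodies
are illustrative, not the committed implementation.

import os
import requests

OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b")  # global default
OLLAMA_MODELS = {  # per-role overrides (hypothetical defaults shown)
    "coding": os.environ.get("T2A_MODEL_CODING", "gemma3:27b-cloud"),
    "cpam": os.environ.get("T2A_MODEL_CPAM", "gemma3:27b-cloud"),
    "validation": os.environ.get("T2A_MODEL_VALIDATION", "deepseek-v3.2:cloud"),
    "qc": os.environ.get("T2A_MODEL_QC", "gemma3:12b"),
}

def get_model(role: str | None = None) -> str:
    """Resolve the model for a role, falling back to the global default."""
    return OLLAMA_MODELS.get(role, OLLAMA_MODEL) if role else OLLAMA_MODEL

def call_ollama(prompt: str, model: str | None = None, role: str | None = None) -> str:
    # Priority: explicit model= > role= mapping > global OLLAMA_MODEL.
    chosen = model or get_model(role)
    resp = requests.post(
        f"{OLLAMA_URL}/api/generate",
        json={"model": chosen, "prompt": prompt, "stream": False},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json().get("response", "")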

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
dom
2026-02-20 00:21:09 +01:00
parent 5c8c2817ec
commit 909e051cc9
39 changed files with 5092 additions and 574 deletions

scripts/benchmark_models.py

@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""Benchmark A/B : gemma3:12b (base) vs pmsi-coder-v2 (fine-tuné).
Compare les codes CIM-10 produits par les deux modèles sur N dossiers.
Teste DP + DAS (échantillon) pour chaque dossier.
Usage: python scripts/benchmark_models.py [--n 50] [--das-max 5]
"""
from __future__ import annotations
import json
import random
import sys
import time
from pathlib import Path

import requests

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.config import STRUCTURED_DIR, OLLAMA_URL, DossierMedical
from src.medical.cim10_dict import load_dict, normalize_code, validate_code
MODEL_BASE = "gemma3:12b"
MODEL_FINETUNED = "pmsi-coder-v2"
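# NB: the doubled braces in the JSON example below are escaped for str.format().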
PROMPT_TEMPLATE = """Tu es un médecin DIM expert en codage PMSI.
Code le diagnostic suivant en CIM-10. Choisis le code le plus spécifique possible.
DIAGNOSTIC : "{texte}"
TYPE : {type_diag}
{contexte}
Réponds UNIQUEMENT avec un objet JSON :
{{"code": "X99.9", "confidence": "high|medium|low", "justification": "explication courte"}}"""
def call_model(prompt: str, model: str, timeout: int = 120) -> tuple[dict | None, float]:
"""Appelle un modèle Ollama et retourne (résultat, durée_s)."""
t0 = time.time()
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": False,
"format": "json",
"options": {"temperature": 0.1, "num_predict": 500},
},
timeout=timeout,
)
resp.raise_for_status()
raw = resp.json().get("response", "")
duration = time.time() - t0
try:
return json.loads(raw), duration
except json.JSONDecodeError:
return None, duration
    except Exception:
        return None, time.time() - t0
def load_dossiers(n: int) -> list[dict]:
"""Charge N dossiers fusionnés diversifiés."""
dossiers = []
for subdir in sorted(STRUCTURED_DIR.iterdir()):
if not subdir.is_dir():
continue
for f in subdir.glob("*fusionne*.json"):
if ".gemma_" in f.name or ".bak" in f.name:
continue
try:
data = json.loads(f.read_text(encoding="utf-8"))
d = DossierMedical.model_validate(data)
if d.diagnostic_principal and d.diagnostic_principal.cim10_suggestion:
dossiers.append({
"name": subdir.name,
"dossier": d,
"path": str(f),
})
except Exception:
continue
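            # keep only the first valid merged file per dossier directory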
break
random.seed(42)
random.shuffle(dossiers)
return dossiers[:n]
def build_contexte(d: DossierMedical) -> str:
"""Construit un contexte clinique résumé."""
parts = []
s = d.sejour
if s.age is not None:
parts.append(f"Patient {s.sexe or '?'}, {s.age} ans")
if s.duree_sejour is not None:
parts.append(f"Durée séjour : {s.duree_sejour}j")
if d.diagnostic_principal:
parts.append(f"DP : {d.diagnostic_principal.texte}")
bio = [f"{b.test}={b.valeur}" for b in d.biologie_cle[:5] if b.valeur]
if bio:
parts.append(f"Bio : {', '.join(bio)}")
return "CONTEXTE : " + " | ".join(parts) if parts else ""
def code_match_level(code_a: str, code_b: str) -> str:
"""Retourne le niveau de correspondance entre deux codes."""
if code_a == code_b:
return "exact"
if code_a[:3] == code_b[:3]:
return "categorie"
return "diff"
def run_benchmark(n: int = 50, das_max: int = 5):
print(f"=== Benchmark A/B : {MODEL_BASE} vs {MODEL_FINETUNED} ===")
print(f" Dossiers : {n}, DAS max/dossier : {das_max}\n")
    # Check that both models respond before starting
for model in [MODEL_BASE, MODEL_FINETUNED]:
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={"model": model, "prompt": "test", "stream": False,
"options": {"num_predict": 1}},
timeout=60,
)
resp.raise_for_status()
print(f" {model} : OK")
except Exception as e:
print(f" {model} : ERREUR — {e}")
sys.exit(1)
dossiers = load_dossiers(n)
print(f"\nDossiers chargés : {len(dossiers)}\n")
cim10 = load_dict()
t_global_start = time.time()
dp_results = []
das_results = []
for i, item in enumerate(dossiers, 1):
d = item["dossier"]
dp = d.diagnostic_principal
name = item["name"]
ctx = build_contexte(d)
# === DP ===
prompt_dp = PROMPT_TEMPLATE.format(
texte=dp.texte,
type_diag="DP (diagnostic principal)",
contexte=ctx,
)
res_base, t_base = call_model(prompt_dp, MODEL_BASE)
res_ft, t_ft = call_model(prompt_dp, MODEL_FINETUNED)
code_base = normalize_code(res_base.get("code", "")) if res_base else "ERREUR"
code_ft = normalize_code(res_ft.get("code", "")) if res_ft else "ERREUR"
conf_base = res_base.get("confidence", "?") if res_base else "?"
conf_ft = res_ft.get("confidence", "?") if res_ft else "?"
valid_base = validate_code(code_base)[0] if code_base != "ERREUR" else False
valid_ft = validate_code(code_ft)[0] if code_ft != "ERREUR" else False
pipeline_code = dp.cim10_suggestion
match_level = code_match_level(code_base, code_ft)
dp_result = {
"dossier": name,
"texte": dp.texte[:80],
"pipeline": pipeline_code,
"base": code_base,
"ft": code_ft,
"conf_base": conf_base,
"conf_ft": conf_ft,
"valid_base": valid_base,
"valid_ft": valid_ft,
"match": match_level,
"t_base": round(t_base, 2),
"t_ft": round(t_ft, 2),
}
dp_results.append(dp_result)
tag = {"exact": "=", "categorie": "~", "diff": "X"}[match_level]
print(f" [{i:2d}/{len(dossiers)}] {name:<20s} DP=\"{dp.texte[:35]:<35s}\" "
f"base={code_base:<7s} ft={code_ft:<7s} [{tag}] "
f"({t_base:.1f}s / {t_ft:.1f}s)")
        # === DAS (sample) ===
das_list = [das for das in d.diagnostics_associes
if das.texte and das.cim10_suggestion]
        if len(das_list) > das_max:
            # deterministic per-dossier sample (hash() is salted per process)
            das_list = random.Random(name).sample(das_list, das_max)
for das in das_list:
prompt_das = PROMPT_TEMPLATE.format(
texte=das.texte,
type_diag="DAS (diagnostic associé significatif)",
contexte=ctx,
)
res_b, tb = call_model(prompt_das, MODEL_BASE)
res_f, tf = call_model(prompt_das, MODEL_FINETUNED)
cb = normalize_code(res_b.get("code", "")) if res_b else "ERREUR"
cf = normalize_code(res_f.get("code", "")) if res_f else "ERREUR"
vb = validate_code(cb)[0] if cb != "ERREUR" else False
vf = validate_code(cf)[0] if cf != "ERREUR" else False
das_results.append({
"dossier": name,
"texte": das.texte[:80],
"pipeline": das.cim10_suggestion,
"base": cb,
"ft": cf,
"conf_base": (res_b or {}).get("confidence", "?"),
"conf_ft": (res_f or {}).get("confidence", "?"),
"valid_base": vb,
"valid_ft": vf,
"match": code_match_level(cb, cf),
"t_base": round(tb, 2),
"t_ft": round(tf, 2),
})
t_global = time.time() - t_global_start
    # === SUMMARY ===
print(f"\n{'='*75}")
print(f"RÉSUMÉ — {len(dp_results)} dossiers, {len(das_results)} DAS testés")
print(f"Durée totale : {t_global/60:.1f} min\n")
for label, results in [("DP", dp_results), ("DAS", das_results)]:
if not results:
continue
nt = len(results)
n_exact = sum(1 for r in results if r["match"] == "exact")
n_cat = sum(1 for r in results if r["match"] == "categorie")
n_diff = sum(1 for r in results if r["match"] == "diff")
n_vb = sum(1 for r in results if r["valid_base"])
n_vf = sum(1 for r in results if r["valid_ft"])
avg_tb = sum(r["t_base"] for r in results) / nt
avg_tf = sum(r["t_ft"] for r in results) / nt
        # Confidence distribution
conf_b = {}
conf_f = {}
for r in results:
conf_b[r["conf_base"]] = conf_b.get(r["conf_base"], 0) + 1
conf_f[r["conf_ft"]] = conf_f.get(r["conf_ft"], 0) + 1
        # Agreement with the pipeline codes (original gemma run)
n_base_eq_pipe = sum(1 for r in results if r["base"] == r["pipeline"])
n_ft_eq_pipe = sum(1 for r in results if r["ft"] == r["pipeline"])
n_base_cat_pipe = sum(1 for r in results
if r["base"][:3] == r["pipeline"][:3])
n_ft_cat_pipe = sum(1 for r in results
if r["ft"][:3] == r["pipeline"][:3])
print(f" --- {label} ({nt} diagnostics) ---")
print(f" Concordance base↔ft :")
print(f" Exact : {n_exact}/{nt} ({100*n_exact/nt:.0f}%)")
print(f" Catégorie : {n_exact+n_cat}/{nt} ({100*(n_exact+n_cat)/nt:.0f}%)")
print(f" Différent : {n_diff}/{nt} ({100*n_diff/nt:.0f}%)")
print(f" Codes valides :")
print(f" base : {n_vb}/{nt} ({100*n_vb/nt:.0f}%)")
print(f" ft : {n_vf}/{nt} ({100*n_vf/nt:.0f}%)")
print(f" vs pipeline (gemma original) :")
print(f" base=pipe : {n_base_eq_pipe}/{nt} exact, {n_base_cat_pipe}/{nt} catégorie")
print(f" ft=pipe : {n_ft_eq_pipe}/{nt} exact, {n_ft_cat_pipe}/{nt} catégorie")
print(f" Temps moyen : base={avg_tb:.2f}s ft={avg_tf:.2f}s (Δ={100*(avg_tf-avg_tb)/avg_tb:+.0f}%)")
print(f" Confiance base : {conf_b}")
print(f" Confiance ft : {conf_f}")
print()
    # List DP disagreements
diffs_dp = [r for r in dp_results if r["match"] == "diff"]
if diffs_dp:
print(f" Différences DP ({len(diffs_dp)}) :")
for r in diffs_dp:
vb = "" if r["valid_base"] else ""
vf = "" if r["valid_ft"] else ""
print(f" {r['dossier']:<18s} \"{r['texte'][:40]}\"")
print(f" base={r['base']:<7s}{vb} ft={r['ft']:<7s}{vf} pipe={r['pipeline']}")
    # Save detailed results
out = {
"meta": {
"date": time.strftime("%Y-%m-%dT%H:%M:%S"),
"model_base": MODEL_BASE,
"model_ft": MODEL_FINETUNED,
"n_dossiers": len(dp_results),
"n_das": len(das_results),
"duration_min": round(t_global / 60, 1),
},
"dp": dp_results,
"das": das_results,
}
    out_path = Path(__file__).parent.parent / "output" / "benchmark_ab.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)  # output/ may not exist yet
    out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"\nRésultats détaillés : {out_path}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--n", type=int, default=50,
help="Nombre de dossiers à tester")
parser.add_argument("--das-max", type=int, default=5,
help="Max DAS testés par dossier")
args = parser.parse_args()
run_benchmark(args.n, args.das_max)

scripts/benchmark_quality.py

@@ -0,0 +1,689 @@
#!/usr/bin/env python3
"""Benchmark qualité T2A — validation end-to-end sur vrais dossiers.
Compare la qualité des codes CIM-10, vetos, downgrades et CPAM
entre runs successifs. Chaque run est sauvegardé dans un répertoire
isolé pour permettre des comparaisons A/B.
Usage:
python scripts/benchmark_quality.py --n 10
python scripts/benchmark_quality.py --n 10 --compare RUN_ID
python scripts/benchmark_quality.py --dossiers 116_23065570,45_23183041
python scripts/benchmark_quality.py --gold-standard
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean, median
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
BENCHMARKS_DIR = ROOT / "output" / "benchmarks"
GOLD_STANDARD_FILE = ROOT / "data" / "gold_standard" / "_selection.json"
INPUT_DIR = ROOT / "input"
OUTPUT_DIR = ROOT / "output" / "structured"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Dossier selection
# ---------------------------------------------------------------------------
def _gold_standard_ids() -> list[str]:
"""Charge les IDs du gold standard."""
if not GOLD_STANDARD_FILE.exists():
print(f"ERREUR: {GOLD_STANDARD_FILE} introuvable")
sys.exit(1)
data = json.loads(GOLD_STANDARD_FILE.read_text("utf-8"))
# Format: "116_23065570/116_23065570_fusionne_cim10" → on prend la partie avant /
return [d.split("/")[0] for d in data["dossiers"]]
def select_dossiers(n: int, gold_standard: bool, specific: list[str] | None, seed: int = 42) -> list[str]:
"""Sélectionne les dossiers à benchmarker."""
if specific:
        # Check that the requested dossiers exist
valid = []
for d in specific:
if (INPUT_DIR / d).is_dir():
valid.append(d)
else:
print(f" WARN: dossier {d} introuvable dans input/")
return valid
if gold_standard:
ids = _gold_standard_ids()
return ids[:n] if n < len(ids) else ids
    # Otherwise: take N dossiers from input/ (deterministic sort + seed for reproducibility)
all_dirs = sorted(
d.name for d in INPUT_DIR.iterdir()
if d.is_dir() and any(d.glob("*.pdf"))
)
if not all_dirs:
print("ERREUR: aucun dossier avec PDF dans input/")
sys.exit(1)
import random
rng = random.Random(seed)
rng.shuffle(all_dirs)
return all_dirs[:n]
# ---------------------------------------------------------------------------
# Pipeline execution
# ---------------------------------------------------------------------------
def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
"""Exécute le pipeline sur un dossier. Retourne (durée_s, succès)."""
input_path = INPUT_DIR / dossier_id
if clean:
for subdir in ["structured", "reports", "anonymized"]:
target = ROOT / "output" / subdir / dossier_id
if target.exists():
shutil.rmtree(target)
t0 = time.time()
try:
result = subprocess.run(
[PY, "-m", "src.main", str(input_path)],
capture_output=True,
text=True,
cwd=str(ROOT),
            timeout=600,  # 10 min max per dossier
)
duration = time.time() - t0
if result.returncode != 0:
print(f" STDERR: {result.stderr[-500:]}")
return duration, False
return duration, True
except subprocess.TimeoutExpired:
return time.time() - t0, False
except Exception as e:
print(f" EXCEPTION: {e}")
return time.time() - t0, False
# ---------------------------------------------------------------------------
# CIM-10 dictionary loading
# ---------------------------------------------------------------------------
def load_cim10_dict() -> dict[str, str]:
"""Charge le dictionnaire CIM-10 (sans passer par le singleton)."""
dict_path = ROOT / "data" / "cim10_dict.json"
supp_path = ROOT / "data" / "cim10_supplements.json"
d = {}
if dict_path.exists():
d = json.loads(dict_path.read_text("utf-8"))
if supp_path.exists():
for code, label in json.loads(supp_path.read_text("utf-8")).items():
d.setdefault(code, label)
return d
def normalize_code(code: str) -> str:
"""K810 → K81.0, k85.1 → K85.1."""
code = code.strip().upper()
if len(code) > 3 and "." not in code:
code = code[:3] + "." + code[3:]
return code
def is_valid_code(code: str, cim10: dict[str, str]) -> bool:
"""Vérifie si un code CIM-10 existe dans le dictionnaire."""
nc = normalize_code(code)
return nc in cim10 or code.upper().strip() in cim10
# ---------------------------------------------------------------------------
# Per-dossier analysis
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
    # Look for the merged file first
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
    # Otherwise fall back to the first _cim10.json
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def analyze_dossier(dossier_id: str, cim10: dict[str, str], duration: float) -> dict:
"""Analyse le JSON de sortie d'un dossier et extrait les métriques."""
result = {
"dossier_id": dossier_id,
"processing_time_s": round(duration, 1),
"success": False,
}
json_path = find_merged_json(dossier_id)
if not json_path:
return result
try:
data = json.loads(json_path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return result
result["success"] = True
# --- DP ---
dp = data.get("diagnostic_principal", {})
dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
dp_suggestion = dp.get("cim10_suggestion") or ""
result["dp"] = {
"texte": (dp.get("texte") or "")[:80],
"code_suggestion": dp_suggestion,
"code_final": dp_code,
"confidence": dp.get("cim10_confidence", ""),
"has_code": bool(dp_code),
"valid_code": is_valid_code(dp_code, cim10) if dp_code else False,
"downgraded": bool(dp_code and dp_suggestion and dp_code != dp_suggestion),
}
# --- DAS ---
das_list = data.get("diagnostics_associes", [])
das_codes = []
das_conf = {"high": 0, "medium": 0, "low": 0}
das_valid = 0
das_no_code = 0
das_downgraded = 0
for d_item in das_list:
code = d_item.get("cim10_final") or d_item.get("cim10_suggestion") or ""
suggestion = d_item.get("cim10_suggestion") or ""
conf = d_item.get("cim10_confidence", "low")
if not code:
das_no_code += 1
continue
das_codes.append(code)
if conf in das_conf:
das_conf[conf] += 1
if is_valid_code(code, cim10):
das_valid += 1
if code and suggestion and code != suggestion:
das_downgraded += 1
n_das_with_code = len(das_codes)
result["das"] = {
"total": len(das_list),
"with_code": n_das_with_code,
"no_code": das_no_code,
"valid": das_valid,
"validity_rate": round(das_valid / n_das_with_code, 3) if n_das_with_code else 0,
"confidence": das_conf,
"downgraded": das_downgraded,
"downgrade_rate": round(das_downgraded / n_das_with_code, 3) if n_das_with_code else 0,
"codes_uniques": sorted(set(das_codes)),
}
    # --- Dossier-level metrics ---
metrics = data.get("metrics", {})
result["metrics"] = {
"das_active": metrics.get("das_active", 0),
"das_removed": metrics.get("das_removed", 0),
"das_ruled_out": metrics.get("das_ruled_out", 0),
}
# --- Veto ---
veto = data.get("veto_report", {})
issues = veto.get("issues", [])
result["veto"] = {
"verdict": veto.get("verdict", "NO_REPORT"),
"score": veto.get("score_contestabilite", 0),
"issues_count": len(issues),
"hard_count": sum(1 for i in issues if i.get("severity") == "HARD"),
"top_issues": [i.get("veto", i.get("type", "?")) for i in issues[:5]],
}
# --- GHM ---
ghm = data.get("ghm_estimation")
result["ghm"] = {
"estimated": ghm is not None and bool(ghm),
"cmd": ghm.get("cmd") if ghm else None,
"severity": ghm.get("severity") if ghm else None,
"ghm": ghm.get("ghm") if ghm else None,
}
# --- CPAM ---
cpam = data.get("controles_cpam", [])
result["cpam"] = {
"controls_count": len(cpam),
"has_response": any(bool(c.get("contre_argumentation")) for c in cpam),
"sources_count": sum(len(c.get("sources_reponse", [])) for c in cpam),
}
# --- Biologie ---
bio = data.get("biologie_cle", [])
result["biologie"] = {
"tests_count": len(bio),
"anomalies": sum(1 for b in bio if b.get("anomalie")),
}
# --- Codes CIM-10 invalides (détail) ---
invalid_codes = []
if dp_code and not is_valid_code(dp_code, cim10):
invalid_codes.append(f"DP:{dp_code}")
for code in das_codes:
if not is_valid_code(code, cim10):
invalid_codes.append(f"DAS:{code}")
result["invalid_codes"] = invalid_codes
return result
# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------
def compute_aggregate(per_dossier: list[dict]) -> dict:
"""Calcule les métriques agrégées sur tous les dossiers."""
successful = [d for d in per_dossier if d.get("success")]
n = len(successful)
if n == 0:
return {"n_total": len(per_dossier), "n_success": 0}
# DP
dp_has_code = sum(1 for d in successful if d["dp"]["has_code"])
dp_valid = sum(1 for d in successful if d["dp"]["valid_code"])
dp_conf = {"high": 0, "medium": 0, "low": 0}
for d in successful:
c = d["dp"]["confidence"]
if c in dp_conf:
dp_conf[c] += 1
dp_downgraded = sum(1 for d in successful if d["dp"]["downgraded"])
# DAS
total_das = sum(d["das"]["total"] for d in successful)
total_das_with_code = sum(d["das"]["with_code"] for d in successful)
total_das_valid = sum(d["das"]["valid"] for d in successful)
total_das_downgraded = sum(d["das"]["downgraded"] for d in successful)
das_conf_agg = {"high": 0, "medium": 0, "low": 0}
for d in successful:
for k in das_conf_agg:
das_conf_agg[k] += d["das"]["confidence"].get(k, 0)
# Veto
verdicts = {}
total_hard = 0
for d in successful:
v = d["veto"]["verdict"]
verdicts[v] = verdicts.get(v, 0) + 1
total_hard += d["veto"]["hard_count"]
# GHM
ghm_estimated = sum(1 for d in successful if d["ghm"]["estimated"])
# CPAM
cpam_total = sum(d["cpam"]["controls_count"] for d in successful)
cpam_with_response = sum(1 for d in successful if d["cpam"]["has_response"])
    # Timing
times = [d["processing_time_s"] for d in successful]
times_sorted = sorted(times)
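    # floor index of the 90th percentile; int(n * 0.9) <= n - 1 for any n >= 1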
p90_idx = int(len(times_sorted) * 0.9)
    # Invalid codes
all_invalid = []
for d in successful:
all_invalid.extend(d.get("invalid_codes", []))
return {
"n_total": len(per_dossier),
"n_success": n,
"n_failed": len(per_dossier) - n,
"dp": {
"has_code_rate": round(dp_has_code / n, 3),
"valid_code_rate": round(dp_valid / n, 3),
"confidence": dp_conf,
"downgraded": dp_downgraded,
},
"das": {
"total": total_das,
"mean_per_dossier": round(total_das / n, 1),
"with_code": total_das_with_code,
"valid": total_das_valid,
"validity_rate": round(total_das_valid / total_das_with_code, 3) if total_das_with_code else 0,
"confidence": das_conf_agg,
"confidence_high_rate": round(das_conf_agg["high"] / total_das_with_code, 3) if total_das_with_code else 0,
"downgraded": total_das_downgraded,
"downgrade_rate": round(total_das_downgraded / total_das_with_code, 3) if total_das_with_code else 0,
},
"veto": {
"verdicts": verdicts,
"hard_total": total_hard,
"dossiers_with_hard": sum(1 for d in successful if d["veto"]["hard_count"] > 0),
},
"ghm": {
"estimated_rate": round(ghm_estimated / n, 3),
},
"cpam": {
"controls_total": cpam_total,
"with_response": cpam_with_response,
},
"timing": {
"mean_s": round(mean(times), 1),
"median_s": round(median(times), 1),
"p90_s": round(times_sorted[p90_idx], 1) if times_sorted else 0,
"total_s": round(sum(times), 1),
},
"invalid_codes": all_invalid,
"invalid_codes_count": len(all_invalid),
}
# ---------------------------------------------------------------------------
# Text report helpers
# ---------------------------------------------------------------------------
def _pct(val: float) -> str:
return f"{val * 100:.1f}%"
def _bar(val: float, width: int = 20) -> str:
filled = int(val * width)
return "" * filled + "" * (width - filled)
def generate_report(run_id: str, config: dict, agg: dict, per_dossier: list[dict]) -> str:
"""Génère un rapport lisible."""
lines = []
w = 66
lines.append("=" * w)
lines.append(f" BENCHMARK QUALITÉ T2A — {run_id}")
lines.append("=" * w)
lines.append(f" Date : {config['timestamp']}")
lines.append(f" Modèles : coding={config['models'].get('coding','?')} cpam={config['models'].get('cpam','?')}")
lines.append(f" validation={config['models'].get('validation','?')} qc={config['models'].get('qc','?')}")
lines.append(f" Dossiers : {agg['n_success']}/{agg['n_total']} traités ({agg.get('n_failed',0)} échecs)")
lines.append(f" Durée : {agg['timing']['total_s']:.0f}s ({agg['timing']['mean_s']:.1f}s/dossier)")
lines.append("-" * w)
# DP
dp = agg["dp"]
lines.append("")
lines.append(" DIAGNOSTIC PRINCIPAL (DP)")
lines.append(f" Code obtenu : {_bar(dp['has_code_rate'])} {_pct(dp['has_code_rate'])}")
lines.append(f" Code CIM-10 valide : {_bar(dp['valid_code_rate'])} {_pct(dp['valid_code_rate'])}")
lines.append(f" Confiance high : {dp['confidence'].get('high',0)}/{agg['n_success']} "
f"medium: {dp['confidence'].get('medium',0)} low: {dp['confidence'].get('low',0)}")
lines.append(f" Downgrades : {dp['downgraded']}")
# DAS
das = agg["das"]
lines.append("")
lines.append(" DIAGNOSTICS ASSOCIÉS (DAS)")
lines.append(f" Total : {das['total']} (moy {das['mean_per_dossier']}/dossier)")
lines.append(f" Avec code : {das['with_code']}/{das['total']}")
lines.append(f" Codes valides : {_bar(das['validity_rate'])} {_pct(das['validity_rate'])}")
lines.append(f" Confiance : high={das['confidence']['high']} "
f"medium={das['confidence']['medium']} low={das['confidence']['low']}")
lines.append(f" Confiance high : {_bar(das['confidence_high_rate'])} {_pct(das['confidence_high_rate'])}")
lines.append(f" Downgrades : {das['downgraded']} ({_pct(das['downgrade_rate'])})")
# Veto
veto = agg["veto"]
lines.append("")
lines.append(" VETOS / QUALITÉ")
for v, count in sorted(veto["verdicts"].items(), key=lambda x: -x[1]):
lines.append(f" {v:12s} : {count}")
lines.append(f" Issues HARD : {veto['hard_total']} (dans {veto['dossiers_with_hard']} dossiers)")
# GHM
lines.append("")
lines.append(" GHM")
lines.append(f" Estimé : {_bar(agg['ghm']['estimated_rate'])} {_pct(agg['ghm']['estimated_rate'])}")
# CPAM
if agg["cpam"]["controls_total"] > 0:
lines.append("")
lines.append(" CPAM")
lines.append(f" Contrôles : {agg['cpam']['controls_total']}")
lines.append(f" Avec réponse : {agg['cpam']['with_response']}")
    # Timing
lines.append("")
lines.append(" TEMPS DE TRAITEMENT")
lines.append(f" Moyen : {agg['timing']['mean_s']:.1f}s")
lines.append(f" Médian : {agg['timing']['median_s']:.1f}s")
lines.append(f" P90 : {agg['timing']['p90_s']:.1f}s")
lines.append(f" Total : {agg['timing']['total_s']:.0f}s")
    # Invalid codes
if agg["invalid_codes"]:
lines.append("")
lines.append(f" CODES CIM-10 INVALIDES ({agg['invalid_codes_count']})")
for code in agg["invalid_codes"][:20]:
lines.append(f" {code}")
if agg["invalid_codes_count"] > 20:
lines.append(f" ... et {agg['invalid_codes_count'] - 20} autres")
    # Per-dossier detail
lines.append("")
lines.append("-" * w)
lines.append(" DÉTAIL PAR DOSSIER")
lines.append("-" * w)
lines.append(f" {'Dossier':<25s} {'DP':>6s} {'DAS':>4s} {'Valid%':>7s} {'Veto':>10s} {'Temps':>6s}")
lines.append(f" {'-'*25:<25s} {'-'*6:>6s} {'-'*4:>4s} {'-'*7:>7s} {'-'*10:>10s} {'-'*6:>6s}")
for d in sorted(per_dossier, key=lambda x: x["dossier_id"]):
if not d.get("success"):
lines.append(f" {d['dossier_id']:<25s} {'ÉCHEC':>6s}")
continue
dp_code = d["dp"]["code_final"] or "-"
dp_mark = "" if d["dp"]["valid_code"] else ""
n_das = d["das"]["total"]
vr = f"{d['das']['validity_rate']*100:.0f}%" if d["das"]["with_code"] else "-"
verdict = d["veto"]["verdict"]
t = f"{d['processing_time_s']:.0f}s"
lines.append(f" {d['dossier_id']:<25s} {dp_code:>5s}{dp_mark} {n_das:>4d} {vr:>7s} {verdict:>10s} {t:>6s}")
lines.append("")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Run-to-run comparison
# ---------------------------------------------------------------------------
def compare_runs(current_agg: dict, baseline_agg: dict, baseline_id: str) -> str:
"""Compare deux runs et génère un rapport diff."""
lines = []
w = 66
lines.append("")
lines.append("=" * w)
lines.append(f" COMPARAISON avec {baseline_id}")
lines.append("=" * w)
def _delta(cur: float, base: float, is_pct: bool = True) -> str:
d = cur - base
sign = "+" if d >= 0 else ""
if is_pct:
return f"{sign}{d*100:.1f}%"
return f"{sign}{d:.1f}"
def _row(label: str, cur_val: float, base_val: float, is_pct: bool = True):
if is_pct:
cur_s = _pct(cur_val)
base_s = _pct(base_val)
else:
cur_s = f"{cur_val:.1f}"
base_s = f"{base_val:.1f}"
delta_s = _delta(cur_val, base_val, is_pct)
lines.append(f" {label:<24s} {base_s:>10s} {cur_s:>10s} {delta_s:>10s}")
lines.append(f" {'Métrique':<24s} {'Baseline':>10s} {'Actuel':>10s} {'Delta':>10s}")
lines.append(f" {'-'*24:<24s} {'-'*10:>10s} {'-'*10:>10s} {'-'*10:>10s}")
_row("DP code valide", current_agg["dp"]["valid_code_rate"], baseline_agg["dp"]["valid_code_rate"])
_row("DAS validité", current_agg["das"]["validity_rate"], baseline_agg["das"]["validity_rate"])
_row("DAS confiance high", current_agg["das"]["confidence_high_rate"], baseline_agg["das"]["confidence_high_rate"])
_row("DAS downgrade", current_agg["das"]["downgrade_rate"], baseline_agg["das"]["downgrade_rate"])
_row("GHM estimé", current_agg["ghm"]["estimated_rate"], baseline_agg["ghm"]["estimated_rate"])
_row("DAS moy/dossier", current_agg["das"]["mean_per_dossier"], baseline_agg["das"]["mean_per_dossier"], is_pct=False)
_row("Temps moyen (s)", current_agg["timing"]["mean_s"], baseline_agg["timing"]["mean_s"], is_pct=False)
    # Invalid codes: new vs fixed relative to the baseline
cur_inv = set(current_agg.get("invalid_codes", []))
base_inv = set(baseline_agg.get("invalid_codes", []))
new_inv = cur_inv - base_inv
fixed_inv = base_inv - cur_inv
if new_inv:
lines.append(f"\n Nouveaux codes invalides : {', '.join(sorted(new_inv))}")
if fixed_inv:
lines.append(f" Codes corrigés : {', '.join(sorted(fixed_inv))}")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def get_current_config() -> dict:
"""Récupère la configuration modèle actuelle."""
try:
from src.config import OLLAMA_MODELS, OLLAMA_MODEL, OLLAMA_URL
return {
"models": dict(OLLAMA_MODELS),
"ollama_model": OLLAMA_MODEL,
"ollama_url": OLLAMA_URL,
}
except ImportError:
return {
"models": {
"coding": os.environ.get("T2A_MODEL_CODING", "?"),
"cpam": os.environ.get("T2A_MODEL_CPAM", "?"),
"validation": os.environ.get("T2A_MODEL_VALIDATION", "?"),
"qc": os.environ.get("T2A_MODEL_QC", "?"),
},
"ollama_model": os.environ.get("OLLAMA_MODEL", "?"),
}
def main():
parser = argparse.ArgumentParser(description="Benchmark qualité T2A")
parser.add_argument("--n", type=int, default=10, help="Nombre de dossiers")
parser.add_argument("--dossiers", type=str, help="IDs séparés par des virgules")
parser.add_argument("--gold-standard", action="store_true", help="Utiliser les 50 dossiers gold standard")
parser.add_argument("--compare", type=str, help="Run ID à comparer")
parser.add_argument("--label", type=str, default="", help="Label pour ce run")
parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
args = parser.parse_args()
    # Dossier selection
specific = args.dossiers.split(",") if args.dossiers else None
dossiers = select_dossiers(args.n, args.gold_standard, specific, args.seed)
print(f"\n Dossiers sélectionnés : {len(dossiers)}")
for d in dossiers:
print(f" - {d}")
# Config
config = get_current_config()
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
if args.label:
run_id = f"{run_id}_{args.label}"
config["timestamp"] = datetime.now().isoformat()
config["run_id"] = run_id
config["dossiers"] = dossiers
config["args"] = {
"n": args.n,
"gold_standard": args.gold_standard,
"clean": args.clean,
"no_reprocess": args.no_reprocess,
"seed": args.seed,
"label": args.label,
}
print(f"\n Run ID : {run_id}")
print(f" Modèles : {config['models']}")
print(f" Reprocess: {'NON' if args.no_reprocess else 'OUI (clean=' + str(args.clean) + ')'}")
print()
    # Load the CIM-10 dictionary
cim10 = load_cim10_dict()
print(f" Dictionnaire CIM-10 : {len(cim10)} codes")
print()
    # Processing
per_dossier = []
for i, dossier_id in enumerate(dossiers, 1):
print(f" [{i}/{len(dossiers)}] {dossier_id}", end="", flush=True)
if args.no_reprocess:
duration = 0.0
success = find_merged_json(dossier_id) is not None
if not success:
print(" — pas de JSON")
else:
print(" — analyse existant")
else:
print(" — traitement...", end="", flush=True)
duration, success = run_pipeline(dossier_id, args.clean)
print(f" {duration:.1f}s {'' if success else ''}")
metrics = analyze_dossier(dossier_id, cim10, duration)
per_dossier.append(metrics)
    # Aggregation
agg = compute_aggregate(per_dossier)
    # Report
report = generate_report(run_id, config, agg, per_dossier)
print(report)
    # Optional comparison with a baseline run
comparison = ""
if args.compare:
baseline_path = BENCHMARKS_DIR / args.compare / "metrics.json"
if baseline_path.exists():
baseline = json.loads(baseline_path.read_text("utf-8"))
comparison = compare_runs(agg, baseline["aggregate"], args.compare)
print(comparison)
else:
print(f"\n WARN: run baseline {args.compare} introuvable ({baseline_path})")
    # Persist config, metrics and report
run_dir = BENCHMARKS_DIR / run_id
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "config.json").write_text(
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
)
(run_dir / "metrics.json").write_text(
json.dumps({"aggregate": agg, "per_dossier": per_dossier}, ensure_ascii=False, indent=2),
encoding="utf-8",
)
(run_dir / "report.txt").write_text(report + comparison, encoding="utf-8")
print(f"\n Résultats sauvegardés dans : {run_dir}")
print(f" Pour comparer un futur run : python scripts/benchmark_quality.py --compare {run_id}")
if __name__ == "__main__":
main()