refactor: reorganize reference data, new extraction modules, remove obsolete code

- Reorganized data/referentiels/: pdfs/, dicts/, user/ (unified layout)
- Fixed "Source absente" badges on the reference-data admin page
- Re-indexed COCOA 2025 (555 → 1451 chunks, 94% coverage)
- Fixed VRAM OOM: embeddings forced onto CPU via T2A_EMBED_CPU (see the sketch after this list)
- New modules: document_router, docx_extractor, image_extractor, ocr_engine
- Completeness module (quality/completude.py + YAML config)
- DIM template (dimensional summary)
- Gunicorn config + t2a-viewer systemd service
- Removed t2a_install_rag_cleanup/ (obsolete copy)
- Removed scripts/ and scripts_t2a_v2/ (old benchmarks)
- Removed 81 _doc.txt test files
- Ollama cache: configurable TTL, YAML loader fixes (see the sketch after this list)
- Dashboard: template improvements (base, index, detail, cpam, validation)
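The VRAM fix and the Ollama cache above are both environment-driven. A minimal sketch of the mechanism, assuming that everything beyond T2A_EMBED_CPU (the T2A_CACHE_TTL variable, the embed_device/TTLCache names) is illustrative rather than taken from the codebase:

import os
import time
from typing import Any

def embed_device() -> str:
    # Any non-empty value of T2A_EMBED_CPU pins the embedding model to CPU,
    # trading throughput for VRAM headroom.
    return "cpu" if os.environ.get("T2A_EMBED_CPU") else "cuda"

class TTLCache:
    """Tiny response cache; TTL in seconds read from T2A_CACHE_TTL (hypothetical name)."""

    def __init__(self) -> None:
        self.ttl = float(os.environ.get("T2A_CACHE_TTL", "3600"))
        self._store: dict[str, tuple[float, Any]] = {}

    def get(self, key: str) -> Any | None:
        hit = self._store.get(key)
        if hit and time.time() - hit[0] < self.ttl:
            return hit[1]
        self._store.pop(key, None)  # drop expired (or absent) entries
        return None

    def put(self, key: str, value: Any) -> None:
        self._store[key] = (time.time(), value)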

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 4e2b4bd946
parent 2578afb6ff
Author: dom
Date: 2026-03-07 16:48:10 +01:00
210 changed files with 6939 additions and 22104 deletions

scripts/benchmark_models.py (deleted)

@@ -1,313 +0,0 @@
#!/usr/bin/env python3
"""Benchmark A/B : gemma3:12b (base) vs pmsi-coder-v2 (fine-tuné).
Compare les codes CIM-10 produits par les deux modèles sur N dossiers.
Teste DP + DAS (échantillon) pour chaque dossier.
Usage: python scripts/benchmark_models.py [--n 50] [--das-max 5]
"""
from __future__ import annotations
import json
import random
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.config import STRUCTURED_DIR, OLLAMA_URL, DossierMedical
from src.medical.cim10_dict import load_dict, normalize_code, validate_code
import requests
MODEL_BASE = "gemma3:12b"
MODEL_FINETUNED = "pmsi-coder-v2"
PROMPT_TEMPLATE = """Tu es un médecin DIM expert en codage PMSI.
Code le diagnostic suivant en CIM-10. Choisis le code le plus spécifique possible.
DIAGNOSTIC : "{texte}"
TYPE : {type_diag}
{contexte}
Réponds UNIQUEMENT avec un objet JSON :
{{"code": "X99.9", "confidence": "high|medium|low", "justification": "explication courte"}}"""
def call_model(prompt: str, model: str, timeout: int = 120) -> tuple[dict | None, float]:
"""Appelle un modèle Ollama et retourne (résultat, durée_s)."""
t0 = time.time()
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={
"model": model,
"prompt": prompt,
"stream": False,
"format": "json",
"options": {"temperature": 0.1, "num_predict": 500},
},
timeout=timeout,
)
resp.raise_for_status()
raw = resp.json().get("response", "")
duration = time.time() - t0
try:
return json.loads(raw), duration
except json.JSONDecodeError:
return None, duration
except Exception:
return None, time.time() - t0
def load_dossiers(n: int) -> list[dict]:
"""Charge N dossiers fusionnés diversifiés."""
dossiers = []
for subdir in sorted(STRUCTURED_DIR.iterdir()):
if not subdir.is_dir():
continue
for f in subdir.glob("*fusionne*.json"):
if ".gemma_" in f.name or ".bak" in f.name:
continue
try:
data = json.loads(f.read_text(encoding="utf-8"))
d = DossierMedical.model_validate(data)
if d.diagnostic_principal and d.diagnostic_principal.cim10_suggestion:
dossiers.append({
"name": subdir.name,
"dossier": d,
"path": str(f),
})
except Exception:
continue
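# One dossier per directory: stop after the first usable fused JSON.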
break
random.seed(42)
random.shuffle(dossiers)
return dossiers[:n]
def build_contexte(d: DossierMedical) -> str:
"""Construit un contexte clinique résumé."""
parts = []
s = d.sejour
if s.age is not None:
parts.append(f"Patient {s.sexe or '?'}, {s.age} ans")
if s.duree_sejour is not None:
parts.append(f"Durée séjour : {s.duree_sejour}j")
if d.diagnostic_principal:
parts.append(f"DP : {d.diagnostic_principal.texte}")
bio = [f"{b.test}={b.valeur}" for b in d.biologie_cle[:5] if b.valeur]
if bio:
parts.append(f"Bio : {', '.join(bio)}")
return "CONTEXTE : " + " | ".join(parts) if parts else ""
def code_match_level(code_a: str, code_b: str) -> str:
"""Retourne le niveau de correspondance entre deux codes."""
if code_a == code_b:
return "exact"
if code_a[:3] == code_b[:3]:
return "categorie"
return "diff"
def run_benchmark(n: int = 50, das_max: int = 5):
print(f"=== Benchmark A/B : {MODEL_BASE} vs {MODEL_FINETUNED} ===")
print(f" Dossiers : {n}, DAS max/dossier : {das_max}\n")
# Vérifier que les deux modèles sont disponibles
for model in [MODEL_BASE, MODEL_FINETUNED]:
try:
resp = requests.post(
f"{OLLAMA_URL}/api/generate",
json={"model": model, "prompt": "test", "stream": False,
"options": {"num_predict": 1}},
timeout=60,
)
resp.raise_for_status()
print(f" {model} : OK")
except Exception as e:
print(f" {model} : ERREUR — {e}")
sys.exit(1)
dossiers = load_dossiers(n)
print(f"\nDossiers chargés : {len(dossiers)}\n")
cim10 = load_dict()
t_global_start = time.time()
dp_results = []
das_results = []
for i, item in enumerate(dossiers, 1):
d = item["dossier"]
dp = d.diagnostic_principal
name = item["name"]
ctx = build_contexte(d)
# === DP ===
prompt_dp = PROMPT_TEMPLATE.format(
texte=dp.texte,
type_diag="DP (diagnostic principal)",
contexte=ctx,
)
res_base, t_base = call_model(prompt_dp, MODEL_BASE)
res_ft, t_ft = call_model(prompt_dp, MODEL_FINETUNED)
code_base = normalize_code(res_base.get("code", "")) if res_base else "ERREUR"
code_ft = normalize_code(res_ft.get("code", "")) if res_ft else "ERREUR"
conf_base = res_base.get("confidence", "?") if res_base else "?"
conf_ft = res_ft.get("confidence", "?") if res_ft else "?"
valid_base = validate_code(code_base)[0] if code_base != "ERREUR" else False
valid_ft = validate_code(code_ft)[0] if code_ft != "ERREUR" else False
pipeline_code = dp.cim10_suggestion
match_level = code_match_level(code_base, code_ft)
dp_result = {
"dossier": name,
"texte": dp.texte[:80],
"pipeline": pipeline_code,
"base": code_base,
"ft": code_ft,
"conf_base": conf_base,
"conf_ft": conf_ft,
"valid_base": valid_base,
"valid_ft": valid_ft,
"match": match_level,
"t_base": round(t_base, 2),
"t_ft": round(t_ft, 2),
}
dp_results.append(dp_result)
tag = {"exact": "=", "categorie": "~", "diff": "X"}[match_level]
print(f" [{i:2d}/{len(dossiers)}] {name:<20s} DP=\"{dp.texte[:35]:<35s}\" "
f"base={code_base:<7s} ft={code_ft:<7s} [{tag}] "
f"({t_base:.1f}s / {t_ft:.1f}s)")
# === DAS (échantillon) ===
das_list = [das for das in d.diagnostics_associes
if das.texte and das.cim10_suggestion]
if len(das_list) > das_max:
random.seed(name)  # str seeding is deterministic; hash() is salted per process
das_list = random.sample(das_list, das_max)
for das in das_list:
prompt_das = PROMPT_TEMPLATE.format(
texte=das.texte,
type_diag="DAS (diagnostic associé significatif)",
contexte=ctx,
)
res_b, tb = call_model(prompt_das, MODEL_BASE)
res_f, tf = call_model(prompt_das, MODEL_FINETUNED)
cb = normalize_code(res_b.get("code", "")) if res_b else "ERREUR"
cf = normalize_code(res_f.get("code", "")) if res_f else "ERREUR"
vb = validate_code(cb)[0] if cb != "ERREUR" else False
vf = validate_code(cf)[0] if cf != "ERREUR" else False
das_results.append({
"dossier": name,
"texte": das.texte[:80],
"pipeline": das.cim10_suggestion,
"base": cb,
"ft": cf,
"conf_base": (res_b or {}).get("confidence", "?"),
"conf_ft": (res_f or {}).get("confidence", "?"),
"valid_base": vb,
"valid_ft": vf,
"match": code_match_level(cb, cf),
"t_base": round(tb, 2),
"t_ft": round(tf, 2),
})
t_global = time.time() - t_global_start
# === RÉSUMÉ ===
print(f"\n{'='*75}")
print(f"RÉSUMÉ — {len(dp_results)} dossiers, {len(das_results)} DAS testés")
print(f"Durée totale : {t_global/60:.1f} min\n")
for label, results in [("DP", dp_results), ("DAS", das_results)]:
if not results:
continue
nt = len(results)
n_exact = sum(1 for r in results if r["match"] == "exact")
n_cat = sum(1 for r in results if r["match"] == "categorie")
n_diff = sum(1 for r in results if r["match"] == "diff")
n_vb = sum(1 for r in results if r["valid_base"])
n_vf = sum(1 for r in results if r["valid_ft"])
avg_tb = sum(r["t_base"] for r in results) / nt
avg_tf = sum(r["t_ft"] for r in results) / nt
# Confiance
conf_b = {}
conf_f = {}
for r in results:
conf_b[r["conf_base"]] = conf_b.get(r["conf_base"], 0) + 1
conf_f[r["conf_ft"]] = conf_f.get(r["conf_ft"], 0) + 1
# Concordance avec pipeline (gemma run original)
n_base_eq_pipe = sum(1 for r in results if r["base"] == r["pipeline"])
n_ft_eq_pipe = sum(1 for r in results if r["ft"] == r["pipeline"])
n_base_cat_pipe = sum(1 for r in results
if r["base"][:3] == r["pipeline"][:3])
n_ft_cat_pipe = sum(1 for r in results
if r["ft"][:3] == r["pipeline"][:3])
print(f" --- {label} ({nt} diagnostics) ---")
print(f" Concordance base↔ft :")
print(f" Exact : {n_exact}/{nt} ({100*n_exact/nt:.0f}%)")
print(f" Catégorie : {n_exact+n_cat}/{nt} ({100*(n_exact+n_cat)/nt:.0f}%)")
print(f" Différent : {n_diff}/{nt} ({100*n_diff/nt:.0f}%)")
print(f" Codes valides :")
print(f" base : {n_vb}/{nt} ({100*n_vb/nt:.0f}%)")
print(f" ft : {n_vf}/{nt} ({100*n_vf/nt:.0f}%)")
print(f" vs pipeline (gemma original) :")
print(f" base=pipe : {n_base_eq_pipe}/{nt} exact, {n_base_cat_pipe}/{nt} catégorie")
print(f" ft=pipe : {n_ft_eq_pipe}/{nt} exact, {n_ft_cat_pipe}/{nt} catégorie")
print(f" Temps moyen : base={avg_tb:.2f}s ft={avg_tf:.2f}s (Δ={100*(avg_tf-avg_tb)/avg_tb:+.0f}%)")
print(f" Confiance base : {conf_b}")
print(f" Confiance ft : {conf_f}")
print()
# Lister les différences DP
diffs_dp = [r for r in dp_results if r["match"] == "diff"]
if diffs_dp:
print(f" Différences DP ({len(diffs_dp)}) :")
for r in diffs_dp:
vb = "" if r["valid_base"] else ""
vf = "" if r["valid_ft"] else ""
print(f" {r['dossier']:<18s} \"{r['texte'][:40]}\"")
print(f" base={r['base']:<7s}{vb} ft={r['ft']:<7s}{vf} pipe={r['pipeline']}")
# Sauvegarder
out = {
"meta": {
"date": time.strftime("%Y-%m-%dT%H:%M:%S"),
"model_base": MODEL_BASE,
"model_ft": MODEL_FINETUNED,
"n_dossiers": len(dp_results),
"n_das": len(das_results),
"duration_min": round(t_global / 60, 1),
},
"dp": dp_results,
"das": das_results,
}
out_path = Path(__file__).parent.parent / "output" / "benchmark_ab.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"\nRésultats détaillés : {out_path}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--n", type=int, default=50,
help="Nombre de dossiers à tester")
parser.add_argument("--das-max", type=int, default=5,
help="Max DAS testés par dossier")
args = parser.parse_args()
run_benchmark(args.n, args.das_max)

scripts/benchmark_nuke3_compare.py (deleted)

@@ -1,874 +0,0 @@
#!/usr/bin/env python3
"""Benchmark NUKE-3 — rapport comparatif LLM off vs on.
Analyse les dossiers JSON existants (output/structured/) pour produire
des métriques DIM-like sur la sélection DP (NUKE-3).
Mode 1 (par défaut) : analyse les JSON existants (pas d'Ollama requis).
Mode 2 (--rerun) : relance le pipeline 2× (LLM off puis LLM on) —
nécessite Ollama pour le mode "on".
Usage:
python scripts/benchmark_nuke3_compare.py # analyse offline
python scripts/benchmark_nuke3_compare.py --n 10 # top 10 dossiers
python scripts/benchmark_nuke3_compare.py --rerun --n 5 # relance pipeline
python scripts/benchmark_nuke3_compare.py --dossiers A,B,C # dossiers spécifiques
python scripts/benchmark_nuke3_compare.py --gold data/gold_crh/gold_crh.jsonl
python scripts/benchmark_nuke3_compare.py --offline --case-id 74_23141536
python scripts/benchmark_nuke3_compare.py --offline --top-errors 20
python scripts/benchmark_nuke3_compare.py --offline --dim-pack 20
"""
from __future__ import annotations
import argparse
import csv
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
OUTPUT_DIR = ROOT / "output" / "structured"
INPUT_DIR = ROOT / "input"
REPORT_PATH = ROOT / "docs" / "NUKE3_BENCHMARK_REPORT.md"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Chargement JSON
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def load_dossier_json(dossier_id: str) -> dict | None:
"""Charge le JSON d'un dossier."""
path = find_merged_json(dossier_id)
if not path:
return None
try:
return json.loads(path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return None
def select_dossiers(n: int, specific: list[str] | None) -> list[str]:
"""Sélectionne les dossiers à analyser."""
if specific:
return [d.strip() for d in specific if d.strip()]
all_dirs = sorted(
d.name for d in OUTPUT_DIR.iterdir()
if d.is_dir() and find_merged_json(d.name) is not None
)
return all_dirs[:n] if n > 0 else all_dirs
# ---------------------------------------------------------------------------
# Analyse NUKE-3 d'un dossier
# ---------------------------------------------------------------------------
def analyze_dp_selection(data: dict) -> dict:
"""Extrait les métriques NUKE-3 d'un dossier."""
dp_sel = data.get("dp_selection")
result = {
"has_dp_selection": dp_sel is not None,
"verdict": None,
"confidence": None,
"chosen_code": None,
"n_candidates": 0,
"n_evidence": 0,
"is_comorbidity_dp": False,
"is_symptom_dp": False,
"is_act_only_dp": False,
"has_evidence": False,
"delta": None,
"reason": None,
}
if not dp_sel:
return result
result["verdict"] = dp_sel.get("verdict")
result["confidence"] = dp_sel.get("confidence")
result["chosen_code"] = dp_sel.get("chosen_code")
candidates = dp_sel.get("candidates", [])
result["n_candidates"] = len(candidates)
evidence = dp_sel.get("evidence", [])
result["n_evidence"] = len(evidence)
result["has_evidence"] = len(evidence) > 0
result["reason"] = dp_sel.get("reason")
# Debug scores
debug = dp_sel.get("debug_scores") or {}
result["delta"] = debug.get("delta")
# Flags du gagnant
if candidates:
winner = candidates[0]
result["is_comorbidity_dp"] = winner.get("is_comorbidity_like", False)
result["is_symptom_dp"] = winner.get("is_symptom_like", False)
result["is_act_only_dp"] = winner.get("is_act_only", False)
return result
# ---------------------------------------------------------------------------
# Agrégation
# ---------------------------------------------------------------------------
def compute_metrics(analyses: list[dict]) -> dict:
"""Calcule les métriques agrégées DIM-like."""
n = len(analyses)
if n == 0:
return {"n": 0}
with_selection = [a for a in analyses if a["has_dp_selection"]]
n_sel = len(with_selection)
confirmed = [a for a in with_selection if a["verdict"] == "CONFIRMED"]
review = [a for a in with_selection if a["verdict"] == "REVIEW"]
# Métriques principales
confirmed_rate = len(confirmed) / n_sel if n_sel else 0
# Evidence
confirmed_with_evidence = sum(1 for a in confirmed if a["has_evidence"])
confirmed_evidence_rate = confirmed_with_evidence / len(confirmed) if confirmed else 0
# Codes problématiques en DP
symptom_count = sum(1 for a in with_selection if a["is_symptom_dp"])
comorbidity_count = sum(1 for a in with_selection if a["is_comorbidity_dp"])
act_only_count = sum(1 for a in with_selection if a["is_act_only_dp"])
# Confidence
conf_high = sum(1 for a in with_selection if a["confidence"] == "high")
conf_med = sum(1 for a in with_selection if a["confidence"] == "medium")
conf_low = sum(1 for a in with_selection if a["confidence"] == "low")
# R-codes en DP (symptômes)
r_code_count = sum(
1 for a in with_selection
if a["chosen_code"] and a["chosen_code"].startswith("R")
)
return {
"n_total": n,
"n_with_selection": n_sel,
"confirmed_count": len(confirmed),
"review_count": len(review),
"confirmed_rate": round(confirmed_rate, 3),
"review_rate": round(1 - confirmed_rate, 3) if n_sel else 0,
"confirmed_evidence_rate": round(confirmed_evidence_rate, 3),
"dp_symptom_rate": round(symptom_count / n_sel, 3) if n_sel else 0,
"dp_comorbidity_rate": round(comorbidity_count / n_sel, 3) if n_sel else 0,
"dp_act_only_rate": round(act_only_count / n_sel, 3) if n_sel else 0,
"dp_r_code_rate": round(r_code_count / n_sel, 3) if n_sel else 0,
"confidence": {
"high": conf_high,
"medium": conf_med,
"low": conf_low,
},
"confidence_high_rate": round(conf_high / n_sel, 3) if n_sel else 0,
}
# ---------------------------------------------------------------------------
# Évaluation gold CRH
# ---------------------------------------------------------------------------
def load_gold(gold_path: str | Path) -> dict:
"""Charge le gold JSONL et retourne un index case_id → GoldCRHCase."""
from src.eval.gold_models import load_gold_index
return load_gold_index(Path(gold_path))
def evaluate_gold_cases(
dossier_details: list[dict],
gold_index: dict,
) -> list[dict]:
"""Évalue les dossiers présents dans le gold. Retourne une liste d'évaluations."""
from src.eval.gold_models import evaluate_dp
evals: list[dict] = []
for d in dossier_details:
case_id = d["id"]
if case_id not in gold_index:
continue
gold_case = gold_index[case_id]
sel = d.get("dp_selection") or {}
chosen_code = sel.get("chosen_code")
verdict = sel.get("verdict")
confidence = sel.get("confidence")
ev = evaluate_dp(chosen_code, gold_case)
ev["verdict"] = verdict
ev["confidence_nuke3"] = confidence
evals.append(ev)
return evals
def compute_gold_metrics(evals: list[dict]) -> dict:
"""Calcule les métriques agrégées sur les cas gold."""
n = len(evals)
if n == 0:
return {"n": 0}
strict = sum(1 for e in evals if e["exact_match_strict"])
tolerant = sum(1 for e in evals if e["exact_match_tolerant_codes"])
family3 = sum(1 for e in evals if e["family3_match_tolerant"])
acceptable = sum(1 for e in evals if e["acceptable_match"])
symptom_bad = sum(1 for e in evals if e["symptom_not_allowed"])
# Confirmed-only accuracy
confirmed_evals = [e for e in evals if e["verdict"] == "CONFIRMED"]
n_conf = len(confirmed_evals)
conf_acceptable = sum(1 for e in confirmed_evals if e["acceptable_match"])
return {
"n": n,
"exact_match_strict": strict,
"exact_match_strict_rate": round(strict / n, 3),
"exact_match_tolerant": tolerant,
"exact_match_tolerant_rate": round(tolerant / n, 3),
"family3_match": family3,
"family3_match_rate": round(family3 / n, 3),
"acceptable_match": acceptable,
"acceptable_match_rate": round(acceptable / n, 3),
"confirmed_accuracy_tolerant": round(conf_acceptable / n_conf, 3) if n_conf else None,
"confirmed_count": n_conf,
"symptom_not_allowed": symptom_bad,
"symptom_not_allowed_rate": round(symptom_bad / n, 3),
}
def write_gold_eval_csv(evals: list[dict], csv_path: Path) -> None:
"""Écrit le CSV d'évaluation gold."""
cols = [
"case_id", "chosen_code", "verdict", "confidence_nuke3",
"dp_expected_code", "acceptable_match", "exact_match_strict",
"symptom_not_allowed", "allow_symptom_dp", "confidence_gold",
]
csv_path.parent.mkdir(parents=True, exist_ok=True)
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=cols, extrasaction="ignore")
writer.writeheader()
for ev in evals:
row = {
"case_id": ev["case_id"],
"chosen_code": ev["chosen_code"] or "",
"verdict": ev["verdict"] or "",
"confidence_nuke3": ev["confidence_nuke3"] or "",
"dp_expected_code": ev["dp_expected_code"],
"acceptable_match": ev["acceptable_match"],
"exact_match_strict": ev["exact_match_strict"],
"symptom_not_allowed": ev["symptom_not_allowed"],
"allow_symptom_dp": ev["allow_symptom_dp"],
"confidence_gold": ev["confidence_gold"],
}
writer.writerow(row)
# ---------------------------------------------------------------------------
# Re-run pipeline (mode --rerun)
# ---------------------------------------------------------------------------
def check_ollama() -> bool:
"""Vérifie que Ollama est joignable."""
try:
import urllib.request
url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
req = urllib.request.Request(f"{url}/api/tags", method="GET")
urllib.request.urlopen(req, timeout=5)
return True
except Exception:
return False
def run_pipeline_with_env(dossier_id: str, llm_flag: str) -> bool:
"""Lance le pipeline sur un dossier avec T2A_DP_RANKER_LLM=flag."""
env = os.environ.copy()
env["T2A_DP_RANKER_LLM"] = llm_flag
try:
result = subprocess.run(
[PY, "-m", "src.main", str(INPUT_DIR / dossier_id)],
capture_output=True, text=True, cwd=str(ROOT),
timeout=600, env=env,
)
return result.returncode == 0
except Exception as e:
print(f" ERREUR: {e}")
return False
# ---------------------------------------------------------------------------
# Rapport Markdown
# ---------------------------------------------------------------------------
def _pct(v: float) -> str:
return f"{v * 100:.1f}%"
def generate_report(
metrics_off: dict,
metrics_on: dict | None,
dossier_details: list[dict],
args: argparse.Namespace,
gold_metrics: dict | None = None,
gold_evals: list[dict] | None = None,
) -> str:
"""Génère le rapport Markdown."""
lines: list[str] = []
now = datetime.now().strftime("%Y-%m-%d %H:%M")
# Commit hash
try:
commit = subprocess.check_output(
["git", "rev-parse", "--short", "HEAD"],
cwd=str(ROOT), text=True, stderr=subprocess.DEVNULL,
).strip()
except Exception:
commit = "?"
lines.append("# NUKE-3 — Benchmark Report")
lines.append("")
lines.append(f"**Date** : {now} ")
lines.append(f"**Commit** : `{commit}` ")
lines.append(f"**Dossiers analysés** : {metrics_off['n_total']} ")
lines.append(f"**Mode** : {'rerun pipeline' if args.rerun else 'analyse offline (JSON existants)'} ")
lines.append("")
# Table comparative
lines.append("## Métriques DIM-like")
lines.append("")
if metrics_on:
lines.append("| Métrique | LLM OFF | LLM ON | Delta |")
lines.append("|----------|---------|--------|-------|")
rows = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows:
v_off = metrics_off.get(key, 0)
v_on = metrics_on.get(key, 0)
delta = v_on - v_off
sign = "+" if delta > 0 else ""
lines.append(
f"| {label} | {_pct(v_off)} | {_pct(v_on)} | {sign}{_pct(delta)} |"
)
else:
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
rows_single = [
("CONFIRMED rate", "confirmed_rate"),
("REVIEW rate", "review_rate"),
("CONFIRMED + evidence", "confirmed_evidence_rate"),
("DP symptôme (R*)", "dp_symptom_rate"),
("DP comorbidité", "dp_comorbidity_rate"),
("DP acte-seul", "dp_act_only_rate"),
("DP R-code", "dp_r_code_rate"),
("Confidence high", "confidence_high_rate"),
]
for label, key in rows_single:
v = metrics_off.get(key, 0)
lines.append(f"| {label} | {_pct(v)} |")
lines.append("")
# Volumes
lines.append("## Volumes")
lines.append("")
lines.append(f"- Dossiers avec dp_selection : {metrics_off['n_with_selection']}/{metrics_off['n_total']}")
lines.append(f"- CONFIRMED : {metrics_off['confirmed_count']}")
lines.append(f"- REVIEW : {metrics_off['review_count']}")
c = metrics_off.get("confidence", {})
lines.append(f"- Confidence — high: {c.get('high', 0)}, medium: {c.get('medium', 0)}, low: {c.get('low', 0)}")
lines.append("")
# Détail par dossier
lines.append("## Détail par dossier")
lines.append("")
lines.append("| Dossier | Verdict | Code | Confidence | Evidence | Candidats | Reason |")
lines.append("|---------|---------|------|------------|----------|-----------|--------|")
for d in dossier_details:
sel = d.get("dp_selection", {})
if not sel:
lines.append(f"| {d['id']} | - | - | - | - | - | pas de dp_selection |")
continue
lines.append(
f"| {d['id']} "
f"| {sel.get('verdict', '-')} "
f"| {sel.get('chosen_code', '-')} "
f"| {sel.get('confidence', '-')} "
f"| {sel.get('n_evidence', 0)} "
f"| {sel.get('n_candidates', 0)} "
f"| {(sel.get('reason') or '-')[:60]} |"
)
# Section gold CRH
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
lines.append("")
lines.append("## Évaluation Gold CRH")
lines.append("")
lines.append(f"**Cas gold évalués** : {gm['n']} ")
lines.append("")
lines.append("| Métrique | Valeur |")
lines.append("|----------|--------|")
lines.append(f"| Exact match (strict) | {_pct(gm['exact_match_strict_rate'])} ({gm['exact_match_strict']}/{gm['n']}) |")
lines.append(f"| Exact match (codes tolérants) | {_pct(gm['exact_match_tolerant_rate'])} ({gm['exact_match_tolerant']}/{gm['n']}) |")
lines.append(f"| Family3 match (tolérant) | {_pct(gm['family3_match_rate'])} ({gm['family3_match']}/{gm['n']}) |")
lines.append(f"| Acceptable match (codes OU family3) | {_pct(gm['acceptable_match_rate'])} ({gm['acceptable_match']}/{gm['n']}) |")
if gm["confirmed_accuracy_tolerant"] is not None:
lines.append(f"| Confirmed accuracy (tolérant) | {_pct(gm['confirmed_accuracy_tolerant'])} ({gm['confirmed_count']} CONFIRMED) |")
lines.append(f"| Symptôme non autorisé | {gm['symptom_not_allowed']}/{gm['n']} |")
lines.append("")
# Détail par cas gold
if gold_evals:
lines.append("### Détail par cas gold")
lines.append("")
lines.append("| Case ID | Choisi | Attendu | Strict | Acceptable | Symptôme interdit | Verdict |")
lines.append("|---------|--------|---------|--------|------------|-------------------|---------|")
for ev in gold_evals:
ok_s = "OK" if ev["exact_match_strict"] else "FAIL"
ok_a = "OK" if ev["acceptable_match"] else "FAIL"
sym = "OUI" if ev["symptom_not_allowed"] else "-"
lines.append(
f"| {ev['case_id']} "
f"| {ev['chosen_code'] or '-'} "
f"| {ev['dp_expected_code']} "
f"| {ok_s} "
f"| {ok_a} "
f"| {sym} "
f"| {ev['verdict'] or '-'} |"
)
lines.append("")
lines.append("")
lines.append("---")
lines.append(f"*Généré par `scripts/benchmark_nuke3_compare.py` — {now}*")
# Règle DIM rappel
lines.append("")
lines.append("> **Règle DIM** : `CONFIRMED` ⇒ `evidence` obligatoirement non vide.")
lines.append("> Un DP sans preuve exploitable est automatiquement `REVIEW`.")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _rebuild_and_select(data: dict) -> dict:
"""Reconstruit un DossierMedical depuis le JSON et exécute select_dp() offline.
Utile quand les JSON n'ont pas de champ dp_selection (générés avant NUKE-3).
"""
from src.config import DossierMedical, Diagnostic, Sejour
from src.medical.dp_selector import select_dp
dp_raw = data.get("diagnostic_principal", {})
das_raw = data.get("diagnostics_associes", [])
doc_type = data.get("document_type", "crh")
sej_raw = data.get("sejour", {})
dp_diag = None
if dp_raw and dp_raw.get("texte"):
dp_diag = Diagnostic(
texte=dp_raw.get("texte", ""),
cim10_suggestion=dp_raw.get("cim10_suggestion") or dp_raw.get("cim10_final"),
cim10_confidence=dp_raw.get("cim10_confidence"),
source=dp_raw.get("source"),
)
das_list = []
for d_item in das_raw:
code = d_item.get("cim10_suggestion") or d_item.get("cim10_final")
if not code:
continue
das_list.append(Diagnostic(
texte=d_item.get("texte", ""),
cim10_suggestion=code,
cim10_confidence=d_item.get("cim10_confidence"),
source=d_item.get("source"),
status=d_item.get("status"),
))
safe_sej = {k: v for k, v in sej_raw.items() if k in Sejour.model_fields}
dossier = DossierMedical(
document_type=doc_type,
sejour=Sejour(**safe_sej),
diagnostic_principal=dp_diag,
diagnostics_associes=das_list,
)
# Construire synthese depuis les champs disponibles.
# Les JSONs pré-NUKE-3 n'ont pas de sections CRH stockées.
# On récupère le texte de conclusion depuis les source_excerpt si besoin.
conclusion = data.get("conclusion_medicale", "")
if not conclusion:
# Chercher "CONCLUSION" dans source_excerpt des DAS ou traitements.
# Prendre l'extrait le plus long (les courts sont souvent tronqués).
best = ""
for container in (das_raw, data.get("traitements_sortie", [])):
for item in container:
excerpt = item.get("source_excerpt", "")
up = excerpt.upper()
if "CONCLUSION" in up:
idx = up.index("CONCLUSION")
candidate = excerpt[idx:]
if len(candidate) > len(best):
best = candidate
conclusion = best
synthese = {
"motif": data.get("motif_hospitalisation", ""),
"conclusion": conclusion,
"diag_sortie": data.get("synthese_medicale", {}).get("diag_sortie", ""),
"diag_principal": data.get("synthese_medicale", {}).get("diag_principal", ""),
"synthese": data.get("synthese_medicale", {}).get("synthese", ""),
}
selection = select_dp(dossier, synthese, config={"llm_enabled": False})
dossier.dp_selection = selection
# Finalizer DP (arbitrage Trackare vs CRH, traçabilité)
try:
from src.medical.dp_finalizer import finalize_dp
finalize_dp(dossier)
except Exception:
pass
# Utiliser dp_final si disponible, sinon dp_selection
final = dossier.dp_final or selection
# Convertir en dict compatible analyze_dp_selection
cands = [c.model_dump() for c in final.candidates]
result = {
"dp_selection": {
"verdict": final.verdict,
"confidence": final.confidence,
"chosen_code": final.chosen_code,
"chosen_term": final.chosen_term,
"candidates": cands,
"evidence": final.evidence,
"reason": final.reason,
"debug_scores": final.debug_scores,
}
}
if dossier.dp_final:
result["dp_final"] = dossier.dp_final.model_dump(exclude_none=True)
if dossier.quality_flags:
result["quality_flags"] = dossier.quality_flags
return result
def _run_debug_reports(
args: argparse.Namespace,
dossier_ids: list[str],
dossier_details: list[dict],
gold_index: dict | None,
gold_evals: list[dict] | None,
out_dir: Path,
) -> None:
"""Exécute les modes --case-id, --top-errors, --dim-pack."""
from src.eval.gold_debug import (
build_case_report,
write_case_report,
build_error_entry,
sort_error_entries,
write_top_errors_csv,
write_top_errors_md,
write_top_errors_jsonl,
select_dim_pack_cases,
write_dim_pack,
)
from src.eval.gold_models import evaluate_dp
has_debug = args.case_id or args.top_errors > 0 or args.dim_pack > 0
if not has_debug:
return
# Helper : build full report for a case
def _build_report_for(case_id: str) -> dict | None:
data = load_dossier_json(case_id)
if not data:
return None
# Offline rebuild si nécessaire
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
dp_sel = data.get("dp_selection")
gold_case_dict = None
eval_result = None
if gold_index and case_id in gold_index:
gc = gold_index[case_id]
gold_case_dict = gc.model_dump()
chosen_code = (dp_sel or {}).get("chosen_code")
eval_result = evaluate_dp(chosen_code, gc)
return build_case_report(case_id, data, dp_sel, gold_case_dict, eval_result)
# --case-id
if args.case_id:
cid = args.case_id.strip()
data = load_dossier_json(cid)
if not data:
print(f"ERREUR: output JSON introuvable pour {cid}")
print(f" Suggestion : relancer le pipeline avec --rerun ou vérifier output/structured/{cid}/")
sys.exit(1)
if gold_index and cid not in gold_index:
print(f"ERREUR: {cid} absent du gold ({len(gold_index)} cas chargés)")
sys.exit(1)
report = _build_report_for(cid)
if report:
jp, mp = write_case_report(report, out_dir)
print(f"\n=== Case debug: {cid} ===")
print(f" JSON : {jp}")
print(f" MD : {mp}")
# --top-errors
if args.top_errors > 0:
if not gold_index:
print("ERREUR: --top-errors requiert --gold (ou auto-détection gold_crh.jsonl)")
sys.exit(1)
# Build reports for all gold cases
all_reports: list[dict] = []
gold_case_ids = set(gold_index.keys())
for cid in dossier_ids:
if cid not in gold_case_ids:
continue
r = _build_report_for(cid)
if r:
all_reports.append(r)
entries = [build_error_entry(r) for r in all_reports]
entries = sort_error_entries(entries)
entries = entries[:args.top_errors]
csv_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.csv"
md_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.md"
jsonl_p = out_dir / "NUKE3_GOLD_TOP_ERRORS.jsonl"
write_top_errors_csv(entries, csv_p)
write_top_errors_md(entries, md_p)
write_top_errors_jsonl(entries, jsonl_p)
print(f"\n=== Top {len(entries)} erreurs gold ===")
print(f" CSV : {csv_p}")
print(f" MD : {md_p}")
print(f" JSONL : {jsonl_p}")
# --dim-pack
if args.dim_pack > 0:
# Build reports for all CRH (non-trackare) dossiers
all_reports_dim: list[dict] = []
for cid in dossier_ids:
r = _build_report_for(cid)
if r and r["document_type"] != "trackare":
all_reports_dim.append(r)
elif r and r["prediction"]["verdict"] == "REVIEW":
# Include trackare-sans-DP too (they go through scoring)
all_reports_dim.append(r)
selected = select_dim_pack_cases(all_reports_dim, args.dim_pack)
csv_p, cases_dir = write_dim_pack(selected, out_dir)
print(f"\n=== DIM Pack ({len(selected)} cas) ===")
print(f" CSV : {csv_p}")
print(f" Cas JSON : {cases_dir}/")
def main():
parser = argparse.ArgumentParser(description="Benchmark NUKE-3 comparatif")
parser.add_argument("--n", type=int, default=0, help="Nombre de dossiers (0=tous)")
parser.add_argument("--dossiers", type=str, default="", help="IDs séparés par virgules")
parser.add_argument("--rerun", action="store_true", help="Relancer le pipeline (nécessite Ollama pour LLM on)")
parser.add_argument("--offline", action="store_true",
help="Exécuter NUKE-3 offline (reconstruit DossierMedical depuis JSON, LLM off)")
parser.add_argument("--gold", type=str, default="",
help="Fichier JSONL gold CRH (évaluation tolérante)")
parser.add_argument("--case-id", type=str, default="",
help="Rapport détaillé pour un cas (ex: 74_23141536)")
parser.add_argument("--top-errors", type=int, default=0,
help="Top N erreurs gold (ex: 20)")
parser.add_argument("--dim-pack", type=int, default=0,
help="Pack DIM de N cas CRH à annoter (ex: 20)")
parser.add_argument("--out-dir", type=str, default=str(ROOT / "docs" / "gold_debug"),
help="Dossier de sortie pour debug reports")
parser.add_argument("--output", type=str, default=str(REPORT_PATH), help="Chemin du rapport")
args = parser.parse_args()
specific = [d.strip() for d in args.dossiers.split(",") if d.strip()] if args.dossiers else None
dossier_ids = select_dossiers(args.n, specific)
if not dossier_ids:
print("ERREUR: aucun dossier trouvé")
sys.exit(1)
print(f"NUKE-3 benchmark — {len(dossier_ids)} dossiers")
# Mode rerun
if args.rerun:
ollama_ok = check_ollama()
print(f" Ollama: {'OK' if ollama_ok else 'INDISPONIBLE'}")
# Pass 1 : LLM OFF
print("\n=== Pass 1 : T2A_DP_RANKER_LLM=0 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "0")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
# Analyse JSON existants (ou résultat du pass 1)
print("\n=== Analyse des dossiers ===")
analyses_off: list[dict] = []
dossier_details: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if not data:
print(f" {did}: JSON introuvable")
dossier_details.append({"id": did, "dp_selection": None})
continue
# Mode offline : reconstruire le DossierMedical et exécuter select_dp
if args.offline and not data.get("dp_selection"):
rebuilt = _rebuild_and_select(data)
data["dp_selection"] = rebuilt["dp_selection"]
analysis = analyze_dp_selection(data)
analyses_off.append(analysis)
dossier_details.append({"id": did, "dp_selection": analysis})
verdict = analysis["verdict"] or "-"
code = analysis["chosen_code"] or "-"
print(f" {did}: {verdict}{code} (evidence: {analysis['n_evidence']})")
metrics_off = compute_metrics(analyses_off)
# Pass 2 : LLM ON (si rerun + Ollama dispo)
metrics_on = None
if args.rerun:
if not check_ollama():
print("\nWARN: Ollama indisponible — pass LLM ON ignorée")
print(" Le rapport ne contiendra que les métriques LLM OFF")
else:
print("\n=== Pass 2 : T2A_DP_RANKER_LLM=1 ===")
for did in dossier_ids:
ok = run_pipeline_with_env(did, "1")
status = "OK" if ok else "FAIL"
print(f" {did}: {status}")
analyses_on: list[dict] = []
for did in dossier_ids:
data = load_dossier_json(did)
if data:
analyses_on.append(analyze_dp_selection(data))
metrics_on = compute_metrics(analyses_on)
# Gold CRH
gold_metrics = None
gold_evals = None
gold_index = None
gold_path = args.gold
if not gold_path:
# Auto-détection
default_gold = ROOT / "data" / "gold_crh" / "gold_crh.jsonl"
if default_gold.exists():
gold_path = str(default_gold)
if gold_path:
try:
gold_index = load_gold(gold_path)
print(f"\n=== Évaluation Gold CRH ({len(gold_index)} cas) ===")
gold_evals = evaluate_gold_cases(dossier_details, gold_index)
gold_metrics = compute_gold_metrics(gold_evals)
for ev in gold_evals:
match_str = "OK" if ev["acceptable_match"] else "FAIL"
sym_str = " [R* interdit]" if ev["symptom_not_allowed"] else ""
print(f" {ev['case_id']}: {ev['chosen_code'] or '-'} vs {ev['dp_expected_code']}"
f"{match_str}{sym_str}")
# CSV évaluation
csv_out = ROOT / "docs" / "NUKE3_GOLD_EVAL.csv"
write_gold_eval_csv(gold_evals, csv_out)
print(f"\nCSV évaluation : {csv_out}")
except Exception as e:
print(f"\nERREUR gold : {e}")
gold_metrics = None
gold_evals = None
# --- Debug reports (--case-id, --top-errors, --dim-pack) ---
out_dir = Path(args.out_dir)
_run_debug_reports(args, dossier_ids, dossier_details, gold_index, gold_evals, out_dir)
# Rapport
report = generate_report(
metrics_off, metrics_on, dossier_details, args,
gold_metrics=gold_metrics, gold_evals=gold_evals,
)
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(report, encoding="utf-8")
print(f"\nRapport écrit : {output_path}")
# Résumé console
print(f"\n{'='*50}")
print(f"CONFIRMED : {metrics_off['confirmed_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['confirmed_rate'])})")
print(f"REVIEW : {metrics_off['review_count']}/{metrics_off['n_with_selection']}"
f" ({_pct(metrics_off['review_rate'])})")
print(f"Evidence : {_pct(metrics_off['confirmed_evidence_rate'])} des CONFIRMED")
print(f"DP symptôme : {_pct(metrics_off['dp_symptom_rate'])}")
print(f"DP comorbidité: {_pct(metrics_off['dp_comorbidity_rate'])}")
if gold_metrics and gold_metrics.get("n", 0) > 0:
gm = gold_metrics
print(f"\n--- Gold CRH ({gm['n']} cas) ---")
print(f"Strict match : {_pct(gm['exact_match_strict_rate'])}")
print(f"Acceptable match : {_pct(gm['acceptable_match_rate'])}")
if gm['confirmed_accuracy_tolerant'] is not None:
print(f"Confirmed acc. : {_pct(gm['confirmed_accuracy_tolerant'])}")
print(f"Symptôme interdit: {gm['symptom_not_allowed']}")
print(f"{'='*50}")
if __name__ == "__main__":
main()

scripts/benchmark_quality.py (deleted)

@@ -1,722 +0,0 @@
#!/usr/bin/env python3
"""Benchmark qualité T2A — validation end-to-end sur vrais dossiers.
Compare la qualité des codes CIM-10, vetos, downgrades et CPAM
entre runs successifs. Chaque run est sauvegardé dans un répertoire
isolé pour permettre des comparaisons A/B.
Usage:
python scripts/benchmark_quality.py --n 10
python scripts/benchmark_quality.py --n 10 --compare RUN_ID
python scripts/benchmark_quality.py --dossiers 116_23065570,45_23183041
python scripts/benchmark_quality.py --gold-standard
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
from statistics import mean, median
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
BENCHMARKS_DIR = ROOT / "output" / "benchmarks"
GOLD_STANDARD_FILE = ROOT / "data" / "gold_standard" / "_selection.json"
INPUT_DIR = ROOT / "input"
OUTPUT_DIR = ROOT / "output" / "structured"
PY = str(ROOT / ".venv" / "bin" / "python3")
# ---------------------------------------------------------------------------
# Sélection des dossiers
# ---------------------------------------------------------------------------
def _gold_standard_ids() -> list[str]:
"""Charge les IDs du gold standard."""
if not GOLD_STANDARD_FILE.exists():
print(f"ERREUR: {GOLD_STANDARD_FILE} introuvable")
sys.exit(1)
data = json.loads(GOLD_STANDARD_FILE.read_text("utf-8"))
# Format: "116_23065570/116_23065570_fusionne_cim10" → on prend la partie avant /
return [d.split("/")[0] for d in data["dossiers"]]
def select_dossiers(n: int, gold_standard: bool, specific: list[str] | None, seed: int = 42) -> list[str]:
"""Sélectionne les dossiers à benchmarker."""
if specific:
# Vérifier que les dossiers existent
valid = []
for d in specific:
if (INPUT_DIR / d).is_dir():
valid.append(d)
else:
print(f" WARN: dossier {d} introuvable dans input/")
return valid
if gold_standard:
ids = _gold_standard_ids()
return ids[:n] if n < len(ids) else ids
# Sinon : prendre N dossiers depuis input/ (tri déterministe + seed pour reproductibilité)
all_dirs = sorted(
d.name for d in INPUT_DIR.iterdir()
if d.is_dir() and any(d.glob("*.pdf"))
)
if not all_dirs:
print("ERREUR: aucun dossier avec PDF dans input/")
sys.exit(1)
import random
rng = random.Random(seed)
rng.shuffle(all_dirs)
return all_dirs[:n]
# ---------------------------------------------------------------------------
# Exécution pipeline
# ---------------------------------------------------------------------------
def run_pipeline(dossier_id: str, clean: bool) -> tuple[float, bool]:
"""Exécute le pipeline sur un dossier. Retourne (durée_s, succès)."""
input_path = INPUT_DIR / dossier_id
if clean:
for subdir in ["structured", "reports", "anonymized"]:
target = ROOT / "output" / subdir / dossier_id
if target.exists():
shutil.rmtree(target)
t0 = time.time()
try:
result = subprocess.run(
[PY, "-m", "src.main", str(input_path)],
capture_output=True,
text=True,
cwd=str(ROOT),
timeout=600, # 10 min max par dossier
)
duration = time.time() - t0
if result.returncode != 0:
print(f" STDERR: {result.stderr[-500:]}")
return duration, False
return duration, True
except subprocess.TimeoutExpired:
return time.time() - t0, False
except Exception as e:
print(f" EXCEPTION: {e}")
return time.time() - t0, False
# ---------------------------------------------------------------------------
# Chargement dictionnaire CIM-10
# ---------------------------------------------------------------------------
def load_cim10_dict() -> dict[str, str]:
"""Charge le dictionnaire CIM-10 (sans passer par le singleton)."""
dict_path = ROOT / "data" / "cim10_dict.json"
supp_path = ROOT / "data" / "cim10_supplements.json"
d = {}
if dict_path.exists():
d = json.loads(dict_path.read_text("utf-8"))
if supp_path.exists():
for code, label in json.loads(supp_path.read_text("utf-8")).items():
d.setdefault(code, label)
return d
def normalize_code(code: str) -> str:
"""K810 → K81.0, k85.1 → K85.1."""
code = code.strip().upper()
if len(code) > 3 and "." not in code:
code = code[:3] + "." + code[3:]
return code
def is_valid_code(code: str, cim10: dict[str, str]) -> bool:
"""Vérifie si un code CIM-10 existe dans le dictionnaire."""
nc = normalize_code(code)
return nc in cim10 or code.upper().strip() in cim10
# ---------------------------------------------------------------------------
# Analyse d'un dossier
# ---------------------------------------------------------------------------
def find_merged_json(dossier_id: str) -> Path | None:
"""Trouve le JSON fusionné d'un dossier."""
d = OUTPUT_DIR / dossier_id
if not d.exists():
return None
# Chercher le fusionné d'abord
fusions = list(d.glob("*fusionne_cim10.json"))
if fusions:
return fusions[0]
# Sinon premier _cim10.json
cim10s = list(d.glob("*_cim10.json"))
return cim10s[0] if cim10s else None
def analyze_dossier(dossier_id: str, cim10: dict[str, str], duration: float) -> dict:
"""Analyse le JSON de sortie d'un dossier et extrait les métriques."""
result = {
"dossier_id": dossier_id,
"processing_time_s": round(duration, 1),
"success": False,
}
json_path = find_merged_json(dossier_id)
if not json_path:
return result
try:
data = json.loads(json_path.read_text("utf-8"))
except (json.JSONDecodeError, OSError):
return result
result["success"] = True
# --- DP ---
dp = data.get("diagnostic_principal", {})
dp_code = dp.get("cim10_final") or dp.get("cim10_suggestion") or ""
dp_suggestion = dp.get("cim10_suggestion") or ""
result["dp"] = {
"texte": (dp.get("texte") or "")[:80],
"code_suggestion": dp_suggestion,
"code_final": dp_code,
"confidence": dp.get("cim10_confidence", ""),
"has_code": bool(dp_code),
"valid_code": is_valid_code(dp_code, cim10) if dp_code else False,
"downgraded": bool(dp_code and dp_suggestion and dp_code != dp_suggestion),
}
# --- DAS ---
das_list = data.get("diagnostics_associes", [])
das_codes = []
das_conf = {"high": 0, "medium": 0, "low": 0}
das_valid = 0
das_no_code = 0
das_downgraded = 0
for d_item in das_list:
code = d_item.get("cim10_final") or d_item.get("cim10_suggestion") or ""
suggestion = d_item.get("cim10_suggestion") or ""
conf = d_item.get("cim10_confidence", "low")
if not code:
das_no_code += 1
continue
das_codes.append(code)
if conf in das_conf:
das_conf[conf] += 1
if is_valid_code(code, cim10):
das_valid += 1
if code and suggestion and code != suggestion:
das_downgraded += 1
n_das_with_code = len(das_codes)
result["das"] = {
"total": len(das_list),
"with_code": n_das_with_code,
"no_code": das_no_code,
"valid": das_valid,
"validity_rate": round(das_valid / n_das_with_code, 3) if n_das_with_code else 0,
"confidence": das_conf,
"downgraded": das_downgraded,
"downgrade_rate": round(das_downgraded / n_das_with_code, 3) if n_das_with_code else 0,
"codes_uniques": sorted(set(das_codes)),
}
# --- Metrics du dossier ---
metrics = data.get("metrics", {})
result["metrics"] = {
"das_active": metrics.get("das_active", 0),
"das_removed": metrics.get("das_removed", 0),
"das_ruled_out": metrics.get("das_ruled_out", 0),
}
# --- Veto ---
veto = data.get("veto_report", {})
issues = veto.get("issues", [])
result["veto"] = {
"verdict": veto.get("verdict", "NO_REPORT"),
"score": veto.get("score_contestabilite", 0),
"issues_count": len(issues),
"hard_count": sum(1 for i in issues if i.get("severity") == "HARD"),
"top_issues": [i.get("veto", i.get("type", "?")) for i in issues[:5]],
}
# --- GHM ---
ghm = data.get("ghm_estimation")
result["ghm"] = {
"estimated": ghm is not None and bool(ghm),
"cmd": ghm.get("cmd") if ghm else None,
"severity": ghm.get("severity") if ghm else None,
"ghm": ghm.get("ghm") if ghm else None,
}
# --- CPAM ---
cpam = data.get("controles_cpam", [])
result["cpam"] = {
"controls_count": len(cpam),
"has_response": any(bool(c.get("contre_argumentation")) for c in cpam),
"sources_count": sum(len(c.get("sources_reponse", [])) for c in cpam),
}
# --- Biologie ---
bio = data.get("biologie_cle", [])
result["biologie"] = {
"tests_count": len(bio),
"anomalies": sum(1 for b in bio if b.get("anomalie")),
}
# --- Codes CIM-10 invalides (détail) ---
invalid_codes = []
if dp_code and not is_valid_code(dp_code, cim10):
invalid_codes.append(f"DP:{dp_code}")
for code in das_codes:
if not is_valid_code(code, cim10):
invalid_codes.append(f"DAS:{code}")
result["invalid_codes"] = invalid_codes
return result
# ---------------------------------------------------------------------------
# Agrégation
# ---------------------------------------------------------------------------
def compute_aggregate(per_dossier: list[dict]) -> dict:
"""Calcule les métriques agrégées sur tous les dossiers."""
successful = [d for d in per_dossier if d.get("success")]
n = len(successful)
if n == 0:
return {"n_total": len(per_dossier), "n_success": 0}
# DP
dp_has_code = sum(1 for d in successful if d["dp"]["has_code"])
dp_valid = sum(1 for d in successful if d["dp"]["valid_code"])
dp_conf = {"high": 0, "medium": 0, "low": 0}
for d in successful:
c = d["dp"]["confidence"]
if c in dp_conf:
dp_conf[c] += 1
dp_downgraded = sum(1 for d in successful if d["dp"]["downgraded"])
# DAS
total_das = sum(d["das"]["total"] for d in successful)
total_das_with_code = sum(d["das"]["with_code"] for d in successful)
total_das_valid = sum(d["das"]["valid"] for d in successful)
total_das_downgraded = sum(d["das"]["downgraded"] for d in successful)
das_conf_agg = {"high": 0, "medium": 0, "low": 0}
for d in successful:
for k in das_conf_agg:
das_conf_agg[k] += d["das"]["confidence"].get(k, 0)
# Veto
verdicts = {}
total_hard = 0
for d in successful:
v = d["veto"]["verdict"]
verdicts[v] = verdicts.get(v, 0) + 1
total_hard += d["veto"]["hard_count"]
# GHM
ghm_estimated = sum(1 for d in successful if d["ghm"]["estimated"])
# CPAM
cpam_total = sum(d["cpam"]["controls_count"] for d in successful)
cpam_with_response = sum(1 for d in successful if d["cpam"]["has_response"])
# Temps
times = [d["processing_time_s"] for d in successful]
times_sorted = sorted(times)
p90_idx = int(len(times_sorted) * 0.9)
# Codes invalides
all_invalid = []
for d in successful:
all_invalid.extend(d.get("invalid_codes", []))
return {
"n_total": len(per_dossier),
"n_success": n,
"n_failed": len(per_dossier) - n,
"dp": {
"has_code_rate": round(dp_has_code / n, 3),
"valid_code_rate": round(dp_valid / n, 3),
"confidence": dp_conf,
"downgraded": dp_downgraded,
},
"das": {
"total": total_das,
"mean_per_dossier": round(total_das / n, 1),
"with_code": total_das_with_code,
"valid": total_das_valid,
"validity_rate": round(total_das_valid / total_das_with_code, 3) if total_das_with_code else 0,
"confidence": das_conf_agg,
"confidence_high_rate": round(das_conf_agg["high"] / total_das_with_code, 3) if total_das_with_code else 0,
"downgraded": total_das_downgraded,
"downgrade_rate": round(total_das_downgraded / total_das_with_code, 3) if total_das_with_code else 0,
},
"veto": {
"verdicts": verdicts,
"hard_total": total_hard,
"dossiers_with_hard": sum(1 for d in successful if d["veto"]["hard_count"] > 0),
},
"ghm": {
"estimated_rate": round(ghm_estimated / n, 3),
},
"cpam": {
"controls_total": cpam_total,
"with_response": cpam_with_response,
},
"timing": {
"mean_s": round(mean(times), 1),
"median_s": round(median(times), 1),
"p90_s": round(times_sorted[p90_idx], 1) if times_sorted else 0,
"total_s": round(sum(times), 1),
},
"invalid_codes": all_invalid,
"invalid_codes_count": len(all_invalid),
}
# ---------------------------------------------------------------------------
# Rapport texte
# ---------------------------------------------------------------------------
def _pct(val: float) -> str:
return f"{val * 100:.1f}%"
def _bar(val: float, width: int = 20) -> str:
filled = int(val * width)
return "" * filled + "" * (width - filled)
def generate_report(run_id: str, config: dict, agg: dict, per_dossier: list[dict]) -> str:
"""Génère un rapport lisible."""
lines = []
w = 66
lines.append("=" * w)
lines.append(f" BENCHMARK QUALITÉ T2A — {run_id}")
lines.append("=" * w)
lines.append(f" Date : {config['timestamp']}")
lines.append(f" Modèles : coding={config['models'].get('coding','?')} cpam={config['models'].get('cpam','?')}")
lines.append(f" validation={config['models'].get('validation','?')} qc={config['models'].get('qc','?')}")
lines.append(f" Dossiers : {agg['n_success']}/{agg['n_total']} traités ({agg.get('n_failed',0)} échecs)")
lines.append(f" Durée : {agg['timing']['total_s']:.0f}s ({agg['timing']['mean_s']:.1f}s/dossier)")
lines.append("-" * w)
# DP
dp = agg["dp"]
lines.append("")
lines.append(" DIAGNOSTIC PRINCIPAL (DP)")
lines.append(f" Code obtenu : {_bar(dp['has_code_rate'])} {_pct(dp['has_code_rate'])}")
lines.append(f" Code CIM-10 valide : {_bar(dp['valid_code_rate'])} {_pct(dp['valid_code_rate'])}")
lines.append(f" Confiance high : {dp['confidence'].get('high',0)}/{agg['n_success']} "
f"medium: {dp['confidence'].get('medium',0)} low: {dp['confidence'].get('low',0)}")
lines.append(f" Downgrades : {dp['downgraded']}")
# DAS
das = agg["das"]
lines.append("")
lines.append(" DIAGNOSTICS ASSOCIÉS (DAS)")
lines.append(f" Total : {das['total']} (moy {das['mean_per_dossier']}/dossier)")
lines.append(f" Avec code : {das['with_code']}/{das['total']}")
lines.append(f" Codes valides : {_bar(das['validity_rate'])} {_pct(das['validity_rate'])}")
lines.append(f" Confiance : high={das['confidence']['high']} "
f"medium={das['confidence']['medium']} low={das['confidence']['low']}")
lines.append(f" Confiance high : {_bar(das['confidence_high_rate'])} {_pct(das['confidence_high_rate'])}")
lines.append(f" Downgrades : {das['downgraded']} ({_pct(das['downgrade_rate'])})")
# Veto
veto = agg["veto"]
lines.append("")
lines.append(" VETOS / QUALITÉ")
for v, count in sorted(veto["verdicts"].items(), key=lambda x: -x[1]):
lines.append(f" {v:12s} : {count}")
lines.append(f" Issues HARD : {veto['hard_total']} (dans {veto['dossiers_with_hard']} dossiers)")
# GHM
lines.append("")
lines.append(" GHM")
lines.append(f" Estimé : {_bar(agg['ghm']['estimated_rate'])} {_pct(agg['ghm']['estimated_rate'])}")
# CPAM
if agg["cpam"]["controls_total"] > 0:
lines.append("")
lines.append(" CPAM")
lines.append(f" Contrôles : {agg['cpam']['controls_total']}")
lines.append(f" Avec réponse : {agg['cpam']['with_response']}")
# Temps
lines.append("")
lines.append(" TEMPS DE TRAITEMENT")
lines.append(f" Moyen : {agg['timing']['mean_s']:.1f}s")
lines.append(f" Médian : {agg['timing']['median_s']:.1f}s")
lines.append(f" P90 : {agg['timing']['p90_s']:.1f}s")
lines.append(f" Total : {agg['timing']['total_s']:.0f}s")
# Codes invalides
if agg["invalid_codes"]:
lines.append("")
lines.append(f" CODES CIM-10 INVALIDES ({agg['invalid_codes_count']})")
for code in agg["invalid_codes"][:20]:
lines.append(f" {code}")
if agg["invalid_codes_count"] > 20:
lines.append(f" ... et {agg['invalid_codes_count'] - 20} autres")
# Détail par dossier
lines.append("")
lines.append("-" * w)
lines.append(" DÉTAIL PAR DOSSIER")
lines.append("-" * w)
lines.append(f" {'Dossier':<25s} {'DP':>6s} {'DAS':>4s} {'Valid%':>7s} {'Veto':>10s} {'Temps':>6s}")
lines.append(f" {'-'*25:<25s} {'-'*6:>6s} {'-'*4:>4s} {'-'*7:>7s} {'-'*10:>10s} {'-'*6:>6s}")
for d in sorted(per_dossier, key=lambda x: x["dossier_id"]):
if not d.get("success"):
lines.append(f" {d['dossier_id']:<25s} {'ÉCHEC':>6s}")
continue
dp_code = d["dp"]["code_final"] or "-"
dp_mark = "" if d["dp"]["valid_code"] else ""
n_das = d["das"]["total"]
vr = f"{d['das']['validity_rate']*100:.0f}%" if d["das"]["with_code"] else "-"
verdict = d["veto"]["verdict"]
t = f"{d['processing_time_s']:.0f}s"
lines.append(f" {d['dossier_id']:<25s} {dp_code:>5s}{dp_mark} {n_das:>4d} {vr:>7s} {verdict:>10s} {t:>6s}")
lines.append("")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Comparaison entre runs
# ---------------------------------------------------------------------------
def compare_runs(current_agg: dict, baseline_agg: dict, baseline_id: str) -> str:
"""Compare deux runs et génère un rapport diff."""
lines = []
w = 66
lines.append("")
lines.append("=" * w)
lines.append(f" COMPARAISON avec {baseline_id}")
lines.append("=" * w)
def _delta(cur: float, base: float, is_pct: bool = True) -> str:
d = cur - base
sign = "+" if d >= 0 else ""
if is_pct:
return f"{sign}{d*100:.1f}%"
return f"{sign}{d:.1f}"
def _row(label: str, cur_val: float, base_val: float, is_pct: bool = True):
if is_pct:
cur_s = _pct(cur_val)
base_s = _pct(base_val)
else:
cur_s = f"{cur_val:.1f}"
base_s = f"{base_val:.1f}"
delta_s = _delta(cur_val, base_val, is_pct)
lines.append(f" {label:<24s} {base_s:>10s} {cur_s:>10s} {delta_s:>10s}")
lines.append(f" {'Métrique':<24s} {'Baseline':>10s} {'Actuel':>10s} {'Delta':>10s}")
lines.append(f" {'-'*24:<24s} {'-'*10:>10s} {'-'*10:>10s} {'-'*10:>10s}")
_row("DP code valide", current_agg["dp"]["valid_code_rate"], baseline_agg["dp"]["valid_code_rate"])
_row("DAS validité", current_agg["das"]["validity_rate"], baseline_agg["das"]["validity_rate"])
_row("DAS confiance high", current_agg["das"]["confidence_high_rate"], baseline_agg["das"]["confidence_high_rate"])
_row("DAS downgrade", current_agg["das"]["downgrade_rate"], baseline_agg["das"]["downgrade_rate"])
_row("GHM estimé", current_agg["ghm"]["estimated_rate"], baseline_agg["ghm"]["estimated_rate"])
_row("DAS moy/dossier", current_agg["das"]["mean_per_dossier"], baseline_agg["das"]["mean_per_dossier"], is_pct=False)
_row("Temps moyen (s)", current_agg["timing"]["mean_s"], baseline_agg["timing"]["mean_s"], is_pct=False)
# Codes invalides
cur_inv = set(current_agg.get("invalid_codes", []))
base_inv = set(baseline_agg.get("invalid_codes", []))
new_inv = cur_inv - base_inv
fixed_inv = base_inv - cur_inv
if new_inv:
lines.append(f"\n Nouveaux codes invalides : {', '.join(sorted(new_inv))}")
if fixed_inv:
lines.append(f" Codes corrigés : {', '.join(sorted(fixed_inv))}")
lines.append("=" * w)
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def get_current_config() -> dict:
"""Récupère la configuration modèle actuelle."""
try:
from src.config import OLLAMA_MODELS, OLLAMA_MODEL, OLLAMA_URL
return {
"models": dict(OLLAMA_MODELS),
"ollama_model": OLLAMA_MODEL,
"ollama_url": OLLAMA_URL,
}
except ImportError:
return {
"models": {
"coding": os.environ.get("T2A_MODEL_CODING", "?"),
"cpam": os.environ.get("T2A_MODEL_CPAM", "?"),
"validation": os.environ.get("T2A_MODEL_VALIDATION", "?"),
"qc": os.environ.get("T2A_MODEL_QC", "?"),
},
"ollama_model": os.environ.get("OLLAMA_MODEL", "?"),
}
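# Hypothetical invocation overriding per-stage models via the environment
# (the model names below are illustrative):
#   T2A_MODEL_CODING=pmsi-coder-v2 T2A_MODEL_QC=gemma3:12b python scripts/benchmark_quality.py --n 10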
def main():
parser = argparse.ArgumentParser(description="Benchmark qualité T2A")
parser.add_argument("--n", type=int, default=10, help="Nombre de dossiers")
parser.add_argument("--dossiers", type=str, help="IDs séparés par des virgules")
parser.add_argument("--gold-standard", action="store_true", help="Utiliser les 50 dossiers gold standard")
parser.add_argument("--compare", type=str, help="Run ID à comparer")
parser.add_argument("--label", type=str, default="", help="Label pour ce run")
parser.add_argument("--no-reprocess", action="store_true", help="Analyser les outputs existants sans relancer le pipeline")
parser.add_argument("--clean", action="store_true", help="Supprimer les outputs avant retraitement")
parser.add_argument("--seed", type=int, default=42, help="Seed pour la sélection aléatoire")
parser.add_argument("--workers", type=int, default=1, help="Nombre de dossiers traités en parallèle")
args = parser.parse_args()
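    # Illustrative invocations (run IDs below are examples, not real runs):
    #   python scripts/benchmark_quality.py --n 20 --label baseline
    #   python scripts/benchmark_quality.py --gold-standard --workers 4
    #   python scripts/benchmark_quality.py --no-reprocess --compare 20250101_120000_baseline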
    # Case selection
specific = args.dossiers.split(",") if args.dossiers else None
dossiers = select_dossiers(args.n, args.gold_standard, specific, args.seed)
print(f"\n Dossiers sélectionnés : {len(dossiers)}")
for d in dossiers:
print(f" - {d}")
# Config
config = get_current_config()
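    # Run IDs sort chronologically (YYYYMMDD_HHMMSS); the optional label keeps them readable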
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
if args.label:
run_id = f"{run_id}_{args.label}"
config["timestamp"] = datetime.now().isoformat()
config["run_id"] = run_id
config["dossiers"] = dossiers
config["args"] = {
"n": args.n,
"gold_standard": args.gold_standard,
"clean": args.clean,
"no_reprocess": args.no_reprocess,
"seed": args.seed,
"label": args.label,
}
print(f"\n Run ID : {run_id}")
print(f" Modèles : {config['models']}")
print(f" Reprocess: {'NON' if args.no_reprocess else 'OUI (clean=' + str(args.clean) + ')'}")
print()
    # Load the CIM-10 dictionary
cim10 = load_cim10_dict()
print(f" Dictionnaire CIM-10 : {len(cim10)} codes")
print()
    # Processing
per_dossier = []
total = len(dossiers)
if args.workers > 1 and not args.no_reprocess:
        # Parallel mode: run the pipelines concurrently, then analyze
from concurrent.futures import ThreadPoolExecutor, as_completed
print(f" Mode parallèle : {args.workers} workers")
pipeline_results: dict[str, tuple[float, bool]] = {}
done = 0
with ThreadPoolExecutor(max_workers=args.workers) as executor:
futures = {
executor.submit(run_pipeline, dossier_id, args.clean): dossier_id
for dossier_id in dossiers
}
for future in as_completed(futures):
dossier_id = futures[future]
try:
duration, success = future.result()
except Exception as e:
print(f" EXCEPTION {dossier_id}: {e}")
duration, success = 0.0, False
pipeline_results[dossier_id] = (duration, success)
done += 1
mark = "" if success else ""
print(f" [{done}/{total}] {dossier_id}{duration:.1f}s {mark}")
        # Sequential analysis (stable order)
for dossier_id in dossiers:
duration, success = pipeline_results[dossier_id]
metrics = analyze_dossier(dossier_id, cim10, duration)
per_dossier.append(metrics)
else:
        # Sequential mode (or --no-reprocess)
for i, dossier_id in enumerate(dossiers, 1):
print(f" [{i}/{total}] {dossier_id}", end="", flush=True)
if args.no_reprocess:
duration = 0.0
success = find_merged_json(dossier_id) is not None
if not success:
print(" — pas de JSON")
else:
print(" — analyse existant")
else:
print(" — traitement...", end="", flush=True)
duration, success = run_pipeline(dossier_id, args.clean)
print(f" {duration:.1f}s {'' if success else ''}")
metrics = analyze_dossier(dossier_id, cim10, duration)
per_dossier.append(metrics)
    # Aggregation
    agg = compute_aggregate(per_dossier)
    # Report
    report = generate_report(run_id, config, agg, per_dossier)
    print(report)
    # Comparison, if requested
    comparison = ""
if args.compare:
baseline_path = BENCHMARKS_DIR / args.compare / "metrics.json"
if baseline_path.exists():
baseline = json.loads(baseline_path.read_text("utf-8"))
comparison = compare_runs(agg, baseline["aggregate"], args.compare)
print(comparison)
else:
print(f"\n WARN: run baseline {args.compare} introuvable ({baseline_path})")
    # Save results
run_dir = BENCHMARKS_DIR / run_id
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "config.json").write_text(
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
)
(run_dir / "metrics.json").write_text(
json.dumps({"aggregate": agg, "per_dossier": per_dossier}, ensure_ascii=False, indent=2),
encoding="utf-8",
)
(run_dir / "report.txt").write_text(report + comparison, encoding="utf-8")
print(f"\n Résultats sauvegardés dans : {run_dir}")
print(f" Pour comparer un futur run : python scripts/benchmark_quality.py --compare {run_id}")
if __name__ == "__main__":
main()


@@ -1,163 +0,0 @@
#!/usr/bin/env python3
"""Régénération ciblée des contrôles CPAM classés Tier C ou sans response_data.
Usage :
cd /home/dom/ai/t2a_v2
.venv/bin/python3 scripts/regenerate_tier_c.py [--dry-run]
Le script :
1. Scanne output/structured/ pour trouver les contrôles Tier C + ceux sans response_data
2. Pour chaque contrôle, relance generate_cpam_response() avec le pipeline corrigé
3. Sauvegarde le JSON mis à jour (backup automatique .bak)
Options :
--dry-run Affiche les contrôles ciblés sans régénérer
"""
from __future__ import annotations
import json
import logging
import shutil
import sys
import time
from pathlib import Path
# Add the project root to sys.path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from src.config import DossierMedical
from src.control.cpam_response import generate_cpam_response
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)-7s %(message)s",
datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
STRUCTURED_DIR = ROOT / "output" / "structured"
def find_targets() -> list[tuple[Path, int]]:
"""Trouve les fichiers JSON contenant des contrôles Tier C ou sans response_data.
Returns:
Liste de (chemin_json, index_du_controle_dans_la_liste).
"""
targets: list[tuple[Path, int]] = []
for sub in sorted(STRUCTURED_DIR.iterdir()):
if not sub.is_dir():
continue
for jf in sub.glob("*_fusionne_cim10.json"):
data = json.loads(jf.read_text(encoding="utf-8"))
controles = data.get("controles_cpam", [])
for i, ctrl in enumerate(controles):
tier = ctrl.get("quality_tier")
has_resp = ctrl.get("response_data") is not None
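                # Target low-quality answers (Tier C) and controls never answered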
if tier == "C" or not has_resp:
targets.append((jf, i))
return targets
def regenerate(targets: list[tuple[Path, int]]) -> dict[str, int]:
"""Régénère les contrôles CPAM ciblés.
Returns:
Statistiques {tier_A, tier_B, tier_C, errors}.
"""
stats = {"A": 0, "B": 0, "C": 0, "errors": 0}
    # Group by file so each case is loaded/saved only once
by_file: dict[Path, list[int]] = {}
for path, idx in targets:
by_file.setdefault(path, []).append(idx)
total = len(targets)
done = 0
for json_path, indices in by_file.items():
dossier_id = json_path.parent.name
logger.info("=== Dossier %s (%d contrôle(s) à régénérer) ===", dossier_id, len(indices))
        # Load the case
data = json.loads(json_path.read_text(encoding="utf-8"))
dossier = DossierMedical.model_validate(data)
modified = False
for idx in indices:
ctrl = dossier.controles_cpam[idx]
done += 1
old_tier = ctrl.quality_tier or "?"
logger.info("[%d/%d] OGC %d%s (ancien tier: %s)",
done, total, ctrl.numero_ogc, ctrl.titre[:60], old_tier)
t0 = time.time()
try:
text, response_data, sources = generate_cpam_response(dossier, ctrl)
elapsed = time.time() - t0
ctrl.contre_argumentation = text
ctrl.response_data = response_data
ctrl.sources_reponse = sources
new_tier = ctrl.quality_tier or "?"
stats[new_tier] = stats.get(new_tier, 0) + 1
modified = True
logger.info(" Résultat : tier %s%s (%d chars, %.1fs)",
old_tier, new_tier, len(text), elapsed)
except Exception:
logger.exception(" ERREUR sur OGC %d", ctrl.numero_ogc)
stats["errors"] += 1
if modified:
            # Back up, then save
backup_path = json_path.with_suffix(".json.bak")
shutil.copy2(json_path, backup_path)
json_path.write_text(
dossier.model_dump_json(indent=2, exclude_none=True),
encoding="utf-8",
)
logger.info(" Sauvegardé : %s (backup: %s)", json_path.name, backup_path.name)
return stats
def main() -> None:
dry_run = "--dry-run" in sys.argv
logger.info("Recherche des contrôles Tier C et sans response_data...")
targets = find_targets()
if not targets:
logger.info("Aucun contrôle à régénérer.")
return
logger.info("Trouvé %d contrôle(s) à régénérer :", len(targets))
for path, idx in targets:
data = json.loads(path.read_text(encoding="utf-8"))
ctrl = data["controles_cpam"][idx]
tier = ctrl.get("quality_tier", "?")
has_resp = "oui" if ctrl.get("response_data") else "NON"
logger.info(" %s OGC %d — tier %s, response_data: %s",
path.parent.name, ctrl["numero_ogc"], tier, has_resp)
if dry_run:
logger.info("Mode dry-run — aucune régénération effectuée.")
return
t0 = time.time()
stats = regenerate(targets)
elapsed = time.time() - t0
logger.info("=== TERMINÉ en %.1f min ===", elapsed / 60)
logger.info("Distribution : A=%d, B=%d, C=%d, erreurs=%d",
stats.get("A", 0), stats.get("B", 0), stats.get("C", 0), stats["errors"])
if __name__ == "__main__":
main()


@@ -1,231 +0,0 @@
#!/usr/bin/env python3
"""Sélectionne 50 dossiers pour le gold standard de validation DIM.
- 25 dossiers CPAM (cas complexes, déjà contrôlés)
- 25 dossiers non-CPAM stratifiés par CMD, confiance DP, nombre de DAS
Crée data/gold_standard/_selection.json et initialise les annotations vides.
"""
from __future__ import annotations
import json
import random
import sys
from datetime import datetime
from pathlib import Path
# Add the project root to sys.path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.config import STRUCTURED_DIR, BASE_DIR, DossierMedical
GOLD_DIR = BASE_DIR / "data" / "gold_standard"
TARGET_TOTAL = 50
TARGET_CPAM = 25
def load_all_dossiers() -> list[dict]:
"""Charge tous les dossiers fusionnés depuis output/structured/."""
dossiers = []
for subdir in sorted(STRUCTURED_DIR.iterdir()):
if not subdir.is_dir():
continue
        # Look for the merged file
fusionne = None
for f in subdir.glob("*fusionne*.json"):
fusionne = f
break
if not fusionne:
            # Fall back to the first JSON in the directory
jsons = sorted(subdir.glob("*.json"))
if jsons:
fusionne = jsons[0]
if not fusionne:
continue
try:
data = json.loads(fusionne.read_text(encoding="utf-8"))
dossier = DossierMedical.model_validate(data)
rel_path = str(fusionne.relative_to(STRUCTURED_DIR))
group_name = subdir.name
dossiers.append({
"dossier_id": f"{group_name}/{fusionne.stem}",
"group_name": group_name,
"path_rel": rel_path,
"dossier": dossier,
})
except Exception as e:
print(f" Erreur chargement {fusionne.name}: {e}")
return dossiers
def select_dossiers(all_dossiers: list[dict]) -> list[dict]:
"""Sélectionne les 50 dossiers selon la stratégie définie."""
# Séparer CPAM / non-CPAM
cpam = [d for d in all_dossiers if d["dossier"].controles_cpam]
non_cpam = [d for d in all_dossiers if not d["dossier"].controles_cpam]
print(f"Dossiers CPAM disponibles : {len(cpam)}")
print(f"Dossiers non-CPAM disponibles : {len(non_cpam)}")
    # Take all CPAM cases (capped at TARGET_CPAM)
selected_cpam = cpam[:TARGET_CPAM]
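    # Directory iteration is sorted upstream, so this CPAM slice is deterministic across runs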
remaining_target = TARGET_TOTAL - len(selected_cpam)
    # Stratify the non-CPAM cases
selected_non_cpam = stratified_sample(non_cpam, remaining_target)
selected = selected_cpam + selected_non_cpam
print(f"\nSélection finale : {len(selected)} dossiers")
print(f" - CPAM : {len(selected_cpam)}")
print(f" - Non-CPAM : {len(selected_non_cpam)}")
return selected
def stratified_sample(dossiers: list[dict], n: int) -> list[dict]:
"""Échantillonnage stratifié par CMD, confiance DP et nombre de DAS."""
if len(dossiers) <= n:
return dossiers
    # Group by CMD
by_cmd: dict[str, list[dict]] = {}
for d in dossiers:
ghm = d["dossier"].ghm_estimation
cmd = ghm.cmd if ghm else "inconnu"
by_cmd.setdefault(cmd or "inconnu", []).append(d)
selected = []
seen_ids = set()
    # Phase 1: one case per CMD (maximum diversity)
    cmds = sorted(by_cmd.keys())
    random.seed(42)  # Reproducible
for cmd in cmds:
if len(selected) >= n:
break
candidates = by_cmd[cmd]
        # Prefer a mix of confidence levels
random.shuffle(candidates)
d = candidates[0]
selected.append(d)
seen_ids.add(d["dossier_id"])
    # Phase 2: fill up, diversifying on DP confidence
if len(selected) < n:
remaining = [d for d in dossiers if d["dossier_id"] not in seen_ids]
        # Sort by DP confidence (low > medium > high, to over-represent hard cases)
conf_order = {"low": 0, "medium": 1, "high": 2, None: 3}
remaining.sort(key=lambda d: (
conf_order.get(
d["dossier"].diagnostic_principal.cim10_confidence
if d["dossier"].diagnostic_principal else None,
3
),
-len(d["dossier"].diagnostics_associes), # beaucoup de DAS d'abord
))
for d in remaining:
if len(selected) >= n:
break
selected.append(d)
return selected[:n]
def create_empty_annotation(dossier_id: str, dossier: DossierMedical) -> dict:
"""Crée une annotation vide pour un dossier."""
dp = dossier.diagnostic_principal
das_list = []
for i, das in enumerate(dossier.diagnostics_associes):
das_list.append({
"index": i,
"texte_original": das.texte,
"code_pipeline": das.cim10_suggestion or "",
"confidence": das.cim10_confidence or "",
"source": das.source or "",
"statut": "correct",
"code_corrige": None,
"commentaire": "",
})
return {
"dossier_id": dossier_id,
"validateur": "",
"date_validation": "",
"statut": "non_commence",
"dp": {
"texte_original": dp.texte if dp else "",
"code_pipeline": dp.cim10_suggestion if dp else "",
"confidence": dp.cim10_confidence if dp else "",
"statut": "correct",
"code_corrige": None,
"commentaire": "",
},
"das": das_list,
"das_ajoutes": [],
"commentaire_general": "",
}
def main():
print("=== Sélection des dossiers pour validation DIM ===\n")
all_dossiers = load_all_dossiers()
print(f"Total dossiers chargés : {len(all_dossiers)}\n")
if not all_dossiers:
print("Aucun dossier trouvé dans output/structured/")
sys.exit(1)
selected = select_dossiers(all_dossiers)
    # Create the gold standard directory
GOLD_DIR.mkdir(parents=True, exist_ok=True)
    # Save the selection
    selection = {
        "date_selection": datetime.now().isoformat(timespec="seconds"),
"total": len(selected),
"cpam": sum(1 for d in selected if d["dossier"].controles_cpam),
"non_cpam": sum(1 for d in selected if not d["dossier"].controles_cpam),
"dossiers": [d["dossier_id"] for d in selected],
}
selection_path = GOLD_DIR / "_selection.json"
selection_path.write_text(
json.dumps(selection, ensure_ascii=False, indent=2),
encoding="utf-8",
)
print(f"\nSélection sauvegardée : {selection_path}")
    # Initialize the empty annotations
created = 0
for d in selected:
dossier_id = d["dossier_id"]
safe_name = dossier_id.replace("/", "__") + ".json"
annot_path = GOLD_DIR / safe_name
if not annot_path.exists():
annotation = create_empty_annotation(dossier_id, d["dossier"])
annot_path.write_text(
json.dumps(annotation, ensure_ascii=False, indent=2),
encoding="utf-8",
)
created += 1
print(f"Annotations vides créées : {created}")
print(f"Annotations existantes préservées : {len(selected) - created}")
    # Summary
    print("\n--- Résumé ---")
for i, d in enumerate(selected, 1):
dos = d["dossier"]
dp_code = dos.diagnostic_principal.cim10_suggestion if dos.diagnostic_principal else "?"
dp_conf = (dos.diagnostic_principal.cim10_confidence or "?") if dos.diagnostic_principal else "?"
n_das = len(dos.diagnostics_associes)
cpam_flag = " [CPAM]" if dos.controles_cpam else ""
ghm = dos.ghm_estimation
cmd = ghm.cmd if ghm else "?"
print(f" {i:2d}. {d['group_name']:<20s} DP={dp_code:<6s} conf={dp_conf:<7s} DAS={n_das:2d} CMD={cmd}{cpam_flag}")
if __name__ == "__main__":
main()