From dbc5bdbaf42135eff6edfbd085eaecc89e07c41e Mon Sep 17 00:00:00 2001
From: dom
Date: Tue, 17 Feb 2026 21:43:02 +0100
Subject: [PATCH] feat: mode Validation DIM dans le viewer Flask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Permet aux médecins DIM de valider/corriger les codes CIM-10 extraits par le
pipeline pour construire un gold standard (50 dossiers).

- ValidationManager : gestion annotations JSON dans data/gold_standard/
- Script sélection 50 dossiers (25 CPAM + 25 stratifiés CMD/confiance)
- Routes /validation, /api/cim10/search, /api/validation/save, /validation/metrics
- Formulaire avec autocomplete CIM-10, boutons Correct/Modifier/Supprimer
- Dashboard métriques : precision, recall, F1, hallucination par confiance/source

Co-Authored-By: Claude Opus 4.6
---
 scripts/select_validation_dossiers.py        | 231 +++++++++++
 src/viewer/app.py                            | 155 +++++++
 src/viewer/templates/base.html               |   4 +
 src/viewer/templates/validation_detail.html  | 404 +++++++++++++++++++
 src/viewer/templates/validation_list.html    | 179 ++++++++
 src/viewer/templates/validation_metrics.html | 243 +++++++++++
 src/viewer/validation.py                     | 272 +++++++++++++
 7 files changed, 1488 insertions(+)
 create mode 100644 scripts/select_validation_dossiers.py
 create mode 100644 src/viewer/templates/validation_detail.html
 create mode 100644 src/viewer/templates/validation_list.html
 create mode 100644 src/viewer/templates/validation_metrics.html
 create mode 100644 src/viewer/validation.py

diff --git a/scripts/select_validation_dossiers.py b/scripts/select_validation_dossiers.py
new file mode 100644
index 0000000..5b70516
--- /dev/null
+++ b/scripts/select_validation_dossiers.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+"""Select 50 dossiers for the DIM validation gold standard.
+
+- up to 25 CPAM dossiers (complex cases, already audited)
+- the remaining slots: non-CPAM dossiers stratified by CMD, DP confidence, number of DAS
+
+Creates data/gold_standard/_selection.json and initialises the empty annotations.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+import sys
+from pathlib import Path
+
+# Add the repository root to sys.path so that the `src` package is importable
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from src.config import STRUCTURED_DIR, BASE_DIR, DossierMedical
+
+GOLD_DIR = BASE_DIR / "data" / "gold_standard"
+TARGET_TOTAL = 50
+TARGET_CPAM = 25
+
+
+def load_all_dossiers() -> list[dict]:
+    """Load one dossier per sub-directory of STRUCTURED_DIR; unreadable files are reported and skipped."""
+    dossiers = []
+    for subdir in sorted(STRUCTURED_DIR.iterdir()):
+        if not subdir.is_dir():
+            continue
+        # Look for the merged ("fusionne") file; sorted() makes the pick deterministic
+        fusionne = None
+        for f in sorted(subdir.glob("*fusionne*.json")):
+            fusionne = f
+            break
+        if not fusionne:
+            # Fall back to the first JSON file of the directory
+            jsons = sorted(subdir.glob("*.json"))
+            if jsons:
+                fusionne = jsons[0]
+        if not fusionne:
+            continue
+
+        try:
+            data = json.loads(fusionne.read_text(encoding="utf-8"))
+            dossier = DossierMedical.model_validate(data)
+            rel_path = str(fusionne.relative_to(STRUCTURED_DIR))
+            group_name = subdir.name
+            dossiers.append({
+                "dossier_id": f"{group_name}/{fusionne.stem}",
+                "group_name": group_name,
+                "path_rel": rel_path,
+                "dossier": dossier,
+            })
+        except Exception as e:
+            print(f" Erreur chargement {fusionne.name}: {e}", file=sys.stderr)
+    return dossiers
+
+
+def select_dossiers(all_dossiers: list[dict]) -> list[dict]:
+    """Return up to TARGET_TOTAL dossiers: CPAM ones first (capped), then a stratified non-CPAM sample."""
+    # Split CPAM / non-CPAM
+    cpam = [d for d in all_dossiers if d["dossier"].controles_cpam]
+    non_cpam = [d for d in all_dossiers if not d["dossier"].controles_cpam]
+
+    print(f"Dossiers CPAM disponibles : {len(cpam)}")
+    print(f"Dossiers non-CPAM disponibles : {len(non_cpam)}")
+
+    # Take every CPAM dossier, capped at TARGET_CPAM
+    selected_cpam = cpam[:TARGET_CPAM]
+    remaining_target = TARGET_TOTAL - len(selected_cpam)
+
+    # Fill the remaining slots with a stratified non-CPAM sample
+    selected_non_cpam = stratified_sample(non_cpam, remaining_target)
+
+    selected = selected_cpam + selected_non_cpam
+    print(f"\nSélection finale : {len(selected)} dossiers")
+    print(f" - CPAM : {len(selected_cpam)}")
+    print(f" - Non-CPAM : {len(selected_non_cpam)}")
+
+    return selected
+
+
+def stratified_sample(dossiers: list[dict], n: int) -> list[dict]:
+    """Return at most n dossiers: one per CMD first, then lowest DP confidence / most DAS first."""
+    if len(dossiers) <= n:
+        return list(dossiers)
+
+    # Group by CMD
+    by_cmd: dict[str, list[dict]] = {}
+    for d in dossiers:
+        ghm = d["dossier"].ghm_estimation
+        cmd = ghm.cmd if ghm else "inconnu"
+        by_cmd.setdefault(cmd or "inconnu", []).append(d)
+
+    selected = []
+    seen_ids = set()
+
+    # Phase 1: one dossier per CMD (maximum diversity)
+    cmds = sorted(by_cmd.keys())
+    rng = random.Random(42)  # Private seeded RNG: reproducible, global random state untouched
+    for cmd in cmds:
+        if len(selected) >= n:
+            break
+        candidates = by_cmd[cmd]
+        # Random pick inside the CMD (the seeded shuffle keeps it reproducible)
+        rng.shuffle(candidates)
+        d = candidates[0]
+        selected.append(d)
+        seen_ids.add(d["dossier_id"])
+
+    # Phase 2: fill up, favouring DP-confidence diversity
+    if len(selected) < n:
+        remaining = [d for d in dossiers if d["dossier_id"] not in seen_ids]
+        # Sort by DP confidence (low, then medium, then high) to over-represent hard cases
+        conf_order = {"low": 0, "medium": 1, "high": 2, None: 3}
+        remaining.sort(key=lambda d: (
+            conf_order.get(
+                d["dossier"].diagnostic_principal.cim10_confidence
+                if d["dossier"].diagnostic_principal else None,
+                3
+            ),
+            -len(d["dossier"].diagnostics_associes),  # dossiers with many DAS first
+        ))
+        for d in remaining:
+            if len(selected) >= n:
+                break
+            selected.append(d)
+
+    return selected[:n]
+
+
+def create_empty_annotation(dossier_id: str, dossier: DossierMedical) -> dict:
+    """Build the initial annotation dict of a dossier; missing pipeline values become empty strings."""
+    dp = dossier.diagnostic_principal
+    das_list = []
+    for i, das in enumerate(dossier.diagnostics_associes):
+        das_list.append({
+            "index": i,
+            "texte_original": das.texte,
+            "code_pipeline": das.cim10_suggestion or "",
+            "confidence": das.cim10_confidence or "",
+            "source": das.source or "",
+            "statut": "correct",
+            "code_corrige": None,
+            "commentaire": "",
+        })
+
+    return {
+        "dossier_id": dossier_id,
+        "validateur": "",
+        "date_validation": "",
+        "statut": "non_commence",
+        "dp": {
+            "texte_original": dp.texte if dp else "",
+            "code_pipeline": (dp.cim10_suggestion or "") if dp else "",
+            "confidence": (dp.cim10_confidence or "") if dp else "",
+            "statut": "correct",
+            "code_corrige": None,
+            "commentaire": "",
+        },
+        "das": das_list,
+        "das_ajoutes": [],
+        "commentaire_general": "",
+    }
+
+
+def main():
+    print("=== Sélection des dossiers pour validation DIM ===\n")
+
+    all_dossiers = load_all_dossiers()
+    print(f"Total dossiers chargés : {len(all_dossiers)}\n")
+
+    if not all_dossiers:
+        print("Aucun dossier trouvé dans output/structured/")
+        sys.exit(1)
+
+    selected = select_dossiers(all_dossiers)
+
+    # Create the gold-standard directory
+    GOLD_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Persist the selection
+    selection = {
+        "date_selection": __import__("datetime").datetime.now().isoformat(timespec="seconds"),
+        "total": len(selected),
+        "cpam": sum(1 for d in selected if d["dossier"].controles_cpam),
+        "non_cpam": sum(1 for d in selected if not d["dossier"].controles_cpam),
+        "dossiers": [d["dossier_id"] for d in selected],
+    }
+    selection_path = GOLD_DIR / "_selection.json"
+    selection_path.write_text(
+        json.dumps(selection, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    print(f"\nSélection sauvegardée : {selection_path}")
+
+    # Initialise the empty annotations (existing files are never overwritten)
+    created = 0
+    for d in selected:
+        dossier_id = d["dossier_id"]
+        safe_name = dossier_id.replace("/", "__") + ".json"
+        annot_path = GOLD_DIR / safe_name
+        if not annot_path.exists():
+            annotation = create_empty_annotation(dossier_id, d["dossier"])
+            annot_path.write_text(
+                json.dumps(annotation, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            created += 1
+
+    print(f"Annotations vides créées : {created}")
+    print(f"Annotations existantes préservées : {len(selected) - created}")
+
+    # Summary ("?" replaces missing values so the :<6s / :<7s format specs never receive None)
+    print("\n--- Résumé ---")
+    for i, d in enumerate(selected, 1):
+        dos = d["dossier"]
+        dp_code = (dos.diagnostic_principal.cim10_suggestion or "?") if dos.diagnostic_principal else "?"
+        dp_conf = (dos.diagnostic_principal.cim10_confidence or "?") if dos.diagnostic_principal else "?"
+        n_das = len(dos.diagnostics_associes)
+        cpam_flag = " [CPAM]" if dos.controles_cpam else ""
+        ghm = dos.ghm_estimation
+        cmd = (ghm.cmd or "?") if ghm else "?"
+        print(f" {i:2d}. {d['group_name']:<20s} DP={dp_code:<6s} conf={dp_conf:<7s} DAS={n_das:2d} CMD={cmd}{cpam_flag}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/viewer/app.py b/src/viewer/app.py
index 75ef0fd..985622b 100644
--- a/src/viewer/app.py
+++ b/src/viewer/app.py
@@ -22,6 +22,7 @@ from ..config import (
 )
 from .. import config as cfg
 from .referentiels import ReferentielManager
+from .validation import ValidationManager
 
 logger = logging.getLogger(__name__)
 
@@ -539,4 +540,158 @@ def create_app() -> Flask:
             logger.exception("Erreur lors du rebuild de l'index")
             return jsonify({"error": str(e)}), 500
 
+    # ------------------------------------------------------------------
+    # DIM validation routes
+    # ------------------------------------------------------------------
+
+    val_manager = ValidationManager()
+
+    @app.route("/validation")
+    def validation_list():
+        groups = scan_dossiers()
+        selection = val_manager.load_selection()
+        annotations = {a["dossier_id"]: a for a in val_manager.list_annotations()}
+
+        # Build the enriched list (pipeline data + annotation status)
+        items = []
+        for dossier_id in selection:
+            annot = annotations.get(dossier_id, {})
+            # Locate the pipeline data: merged file first, else first file of the group
+            parts = dossier_id.split("/")
+            group_name = parts[0] if parts else ""
+            group_items = groups.get(group_name, [])
+            pipeline = None
+            for gi in group_items:
+                if "fusionne" in gi["name"]:
+                    pipeline = gi
+                    break
+            if not pipeline and group_items:
+                pipeline = group_items[0]
+
+            d = pipeline["dossier"] if pipeline else None
+            items.append({
+                "dossier_id": dossier_id,
+                "group_name": group_name,
+                "dp_code": d.diagnostic_principal.cim10_suggestion if d and d.diagnostic_principal else "",
+                "dp_texte": d.diagnostic_principal.texte if d and d.diagnostic_principal else "",
+                "dp_confidence": d.diagnostic_principal.cim10_confidence if d and d.diagnostic_principal else "",
+                "nb_das": len(d.diagnostics_associes) if d else 0,
+                "has_cpam": bool(d and d.controles_cpam),
+                "statut": annot.get("statut", "non_commence"),
+                "validateur": annot.get("validateur", ""),
+                "date_validation": annot.get("date_validation", ""),
+            })
+
+        total = len(items)
+        valides = sum(1 for i in items if i["statut"] == "valide")
+        en_cours = sum(1 for i in items if i["statut"] == "en_cours")
+
+        return render_template(
+            "validation_list.html",
+            items=items,
+            total=total,
+            valides=valides,
+            en_cours=en_cours,
+            groups=groups,
+        )
+
+    @app.route("/validation/<path:dossier_id>")
+    def validation_detail(dossier_id: str):
+        groups = scan_dossiers()
+        # Load the annotation; unknown dossiers are a 404
+        annotation = val_manager.load_annotation(dossier_id)
+        if not annotation:
+            abort(404)
+
+        # Load the pipeline data: merged file first, else first file of the group
+        parts = dossier_id.split("/")
+        group_name = parts[0] if parts else ""
+        group_items = groups.get(group_name, [])
+        pipeline = None
+        for gi in group_items:
+            if "fusionne" in gi["name"]:
+                pipeline = gi
+                break
+        if not pipeline and group_items:
+            pipeline = group_items[0]
+
+        dossier = pipeline["dossier"] if pipeline else None
+
+        # Navigation: previous / next dossier (both None when the dossier is not in the selection)
+        selection = val_manager.load_selection()
+        current_idx = selection.index(dossier_id) if dossier_id in selection else -1
+        prev_id = selection[current_idx - 1] if current_idx > 0 else None
+        next_id = selection[current_idx + 1] if 0 <= current_idx < len(selection) - 1 else None
+
+        return render_template(
+            "validation_detail.html",
+            annotation=annotation,
+            dossier=dossier,
+            dossier_id=dossier_id,
+            group_name=group_name,
+            prev_id=prev_id,
+            next_id=next_id,
+            groups=groups,
+        )
+
+    @app.route("/api/validation/save", methods=["POST"])
+    def api_validation_save():
+        data = request.get_json(silent=True)
+        if not isinstance(data, dict) or not isinstance(data.get("dossier_id"), str):
+            return jsonify({"error": "dossier_id requis"}), 400
+        dossier_id = data["dossier_id"]
+        # Check that the dossier belongs to the selection (skipped while no selection exists)
+        selection = val_manager.load_selection()
+        if selection and dossier_id not in selection:
+            return jsonify({"error": "Dossier non sélectionné pour validation"}), 403
+        try:
+            val_manager.save_annotation(dossier_id, data)
+            return jsonify({"ok": True})
+        except Exception as e:
+            logger.exception("Erreur sauvegarde annotation %s", dossier_id)
+            return jsonify({"error": str(e)}), 500
+
+    @app.route("/api/cim10/search")
+    def api_cim10_search():
+        from ..medical.cim10_dict import load_dict, normalize_text
+        q = request.args.get("q", "").strip()
+        if len(q) < 2:
+            return jsonify({"results": []})
+
+        cim10 = load_dict()
+        q_norm = normalize_text(q)
+        q_upper = q.upper()
+
+        results = []
+        # Code-prefix matches first
+        for code, label in cim10.items():
+            if code.upper().startswith(q_upper):
+                results.append({"code": code, "label": label})
+                if len(results) >= 20:
+                    break
+
+        # Then normalised-label matches (an empty q_norm would match every label, so skip it)
+        if q_norm and len(results) < 20:
+            for code, label in cim10.items():
+                if any(r["code"] == code for r in results):
+                    continue
+                if q_norm in normalize_text(label):
+                    results.append({"code": code, "label": label})
+                    if len(results) >= 20:
+                        break
+
+        return jsonify({"results": results})
+
+    @app.route("/validation/metrics")
+    def validation_metrics():
+        groups = scan_dossiers()
+        metrics = val_manager.compute_metrics(groups)
+        selection = val_manager.load_selection()
+        return render_template(
+            "validation_metrics.html",
+            metrics=metrics,
+            total_selection=len(selection),
+            groups=groups,
+        )
+
     return app
diff --git a/src/viewer/templates/base.html b/src/viewer/templates/base.html
index 945514e..b3f22fd 100644
--- a/src/viewer/templates/base.html
+++ b/src/viewer/templates/base.html
@@ -262,6 +262,10 @@ onmouseover="this.style.color='#f8fafc'" onmouseout="this.style.color='#cbd5e1'"> Référentiels RAG + + Validation DIM +