feat(ui): interface Streamlit de review & annotation
Ajoute pipeline/ui_overlay.py : interface web pour inspecter les
extractions et construire un gold set annoté manuellement.
Fonctionnalités :
- Un onglet par type de page détectée dans le dossier (recueil,
concertation 1/2, concertation 2/2, preuves…).
- Image PDF à gauche + champs éditables à droite, spécifiques au type
de page (codes CIM/CCAM pour recueil, GHS + décision pour
concertation 2, argumentaire pour concertation 1…).
- Badges de validation ATIH à côté de chaque code :
🟢 valide (libellé officiel au survol)
🟡 invalide, suggestion Levenshtein≤1 disponible
🔴 invalide, pas de suggestion
- Comparateur au gold set : ✓/✗/∅/— selon divergence.
- Sidebar : sélecteur dossier, métriques ATIH, cohérence GHM↔GHS.
- Expanders JSON pipeline / JSON gold / OCR raw pour debug.
Sauvegarde des annotations dans gold/<nom>.json au même format que
les JSONs pipeline, ce qui permettra de mesurer objectivement la
qualité de futures versions du pipeline (champ par champ vs gold).
Lancement : `streamlit run pipeline/ui_overlay.py` depuis la racine.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
353
pipeline/ui_overlay.py
Normal file
353
pipeline/ui_overlay.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""Interface Streamlit de review / annotation des extractions OGC (V2).
|
||||
|
||||
Usages :
|
||||
1. **Visualisation** : image + champs structurés + JSON brut côte-à-côte,
|
||||
pour chaque page du dossier (recueil, concertations, preuves…).
|
||||
2. **Correction & gold set** : éditer chaque champ, sauvegarder dans gold/<nom>.json.
|
||||
3. **Badges de validation ATIH** : chaque code médical est marqué ✓/✗ avec la
|
||||
suggestion de correction s'il existe une correction Levenshtein ≤ 1.
|
||||
|
||||
Lancement (depuis la racine du projet) :
|
||||
streamlit run pipeline/ui_overlay.py
|
||||
|
||||
Ou indirectement via run_overlay.sh.
|
||||
"""
|
||||
from __future__ import annotations

import json
import sys
from copy import deepcopy
from pathlib import Path

# Make `pipeline.*` importable when streamlit executes this file directly
# (streamlit runs the script without installing the package on sys.path).
_REPO_ROOT = Path(__file__).resolve().parent.parent
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

import streamlit as st
from PIL import Image

from pipeline.ingest import pdf_to_images
|
||||
|
||||
|
||||
# ============================================================
# Configuration
# ============================================================

# Input PDFs ("OGC *.pdf"), pipeline extraction JSONs, and manual gold set.
PDF_DIR = Path("2018 CARC")
EXTRACT_DIR = Path("output/v2")
GOLD_DIR = Path("gold")
GOLD_DIR.mkdir(exist_ok=True)  # created at import time so save_gold() cannot fail on a missing dir

# Canonical display order of page types (drives the tab order in main()).
PAGE_ORDER = ["recueil", "concertation_med", "hospitalisation",
              "preuves", "concertation_2", "concertation_1"]
# Human-readable tab label per page type.
PAGE_LABEL = {
    "recueil": "p1 — Recueil",
    "concertation_med": "p2 — Concertation médicale",
    "hospitalisation": "p3 — Hospitalisation (manuscrit)",
    "preuves": "p4 — Éléments de preuve",
    "concertation_2": "p5 — Concertation 2/2 (décision)",
    "concertation_1": "p6 — Concertation 1/2 (argumentaire)",
}

# Editable fields per page type: list of (section title, [dotted field paths]).
# Dotted paths ("codage_etab.dp") are resolved by get_field()/set_field().
PAGE_FIELDS = {
    "recueil": [
        ("En-tête", [
            "etablissement", "finess", "date_debut_controle",
            "n_ogc", "n_champ", "dates_sejour",
        ]),
        ("Codage Établissement", [
            "codage_etab.dp", "codage_etab.dp_libelle", "codage_etab.dr",
        ]),
        ("Codage Recodage", [
            "codage_reco.dp", "codage_reco.dr",
        ]),
        ("GHM / GHS", [
            "ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco",
        ]),
        ("Décisions", [
            "recodage_impactant", "ghs_injustifie",
            "accord_desaccord", "praticien_conseil",
        ]),
    ],
    "concertation_2": [
        ("Décision finale", [
            "ghs_initial", "ghs_avant_concertation", "ghs_final",
            "decision", "date_concertation",
        ]),
        ("Signatures", [
            "praticien_controleur", "medecin_dim",
        ]),
    ],
    "concertation_1": [
        ("Argumentaire", [
            "date_concertation", "argumentaire",
        ]),
    ],
    "preuves": [
        ("Entête preuves", [
            "date", "praticien_controleur", "medecin_dim",
        ]),
    ],
    # concertation_med and hospitalisation: no structured editing
    # (pages are nearly empty or dense handwriting).
}

# Which fields of a page can be checked against ATIH referentials.
# NOTE(review): not referenced elsewhere in this file — presumably consumed
# by the validation step of the pipeline; confirm before removing.
VALIDATION_PATHS = {
    "recueil": [
        ("codage_etab.dp", "cim10"),
        ("codage_etab.dr", "cim10"),
        ("codage_reco.dp", "cim10"),
        ("codage_reco.dr", "cim10"),
        ("ghm_etab", "ghm"),
        ("ghs_etab", "ghs"),
        ("ghm_reco", "ghm"),
        ("ghs_reco", "ghs"),
    ],
}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Helpers I/O
|
||||
# ============================================================
|
||||
|
||||
def list_pdfs() -> list[Path]:
    """Every OGC PDF found in PDF_DIR, sorted by filename."""
    candidates = PDF_DIR.glob("OGC *.pdf")
    return sorted(candidates)
|
||||
|
||||
|
||||
def load_extract(name: str) -> dict | None:
    """Parsed pipeline JSON for dossier *name*, or None if no extraction exists."""
    path = EXTRACT_DIR / f"{name}.json"
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return None
|
||||
|
||||
|
||||
def load_gold(name: str) -> dict | None:
    """Parsed gold-set JSON for dossier *name*, or None if not annotated yet."""
    path = GOLD_DIR / f"{name}.json"
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return None
|
||||
|
||||
|
||||
def save_gold(name: str, data: dict) -> Path:
    """Serialize *data* to gold/<name>.json (UTF-8, pretty-printed); return the path."""
    dest = GOLD_DIR / f"{name}.json"
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    dest.write_text(payload, encoding="utf-8")
    return dest
|
||||
|
||||
|
||||
def get_field(d: dict | None, path: str) -> str:
|
||||
if d is None: return ""
|
||||
for k in path.split("."):
|
||||
d = d.get(k, "") if isinstance(d, dict) else ""
|
||||
return str(d) if d else ""
|
||||
|
||||
|
||||
def set_field(d: dict, path: str, value: str) -> None:
    """Write *value* at dotted *path* inside *d*, creating intermediate dicts.

    Any intermediate node that is missing or not a dict is replaced by a
    fresh empty dict before descending.
    """
    *parents, leaf = path.split(".")
    node = d
    for key in parents:
        child = node.get(key)
        if not isinstance(child, dict):
            child = {}
            node[key] = child
        node = child
    node[leaf] = value
|
||||
|
||||
|
||||
def compare_value(pipe_val: str, gold_val: str) -> str:
    """Marker comparing a pipeline value with its gold counterpart.

    ∅ = both empty, — = no gold value to compare against,
    ✓ = match (whitespace-insensitive), ✗ = mismatch.
    """
    if not gold_val:
        return "∅" if not pipe_val else "—"
    return "✓" if pipe_val.strip() == gold_val.strip() else "✗"
|
||||
|
||||
|
||||
def validation_marker(page_data: dict, field_path: str) -> tuple[str, str]:
    """Map a field's ATIH validation entry to a (badge, tooltip) pair.

    Badges: 🟢 valid (official label in the tooltip when available),
    🟡 invalid with a correction suggestion, 🔴 invalid without one,
    and ("", "") when no validation verdict exists for the field.
    """
    # Walk the field's dotted path inside the page's "_validation" subtree.
    node = page_data.get("_validation") or {}
    for key in field_path.split("."):
        node = node.get(key) if isinstance(node, dict) else None
    if not isinstance(node, dict) or "valid" not in node:
        return "", ""
    verdict = node.get("valid")
    if verdict is True:
        label = node.get("libelle_ref", "")
        return "🟢", (f"ATIH ok — {label}" if label else "ATIH ok")
    if verdict is False:
        fix = node.get("suggestion", "")
        if fix:
            return "🟡", f"invalide ATIH — suggestion : {fix}"
        return "🔴", "invalide ATIH"
    # Verdict present but neither True nor False: treat as "no info".
    return "", ""
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Vue principale
|
||||
# ============================================================
|
||||
|
||||
def render_page_editor(name: str, ptype: str, extract: dict, gold: dict | None):
    """Render one page tab: PDF image (left) + editable fields (right) + raw JSON.

    Args:
        name: dossier stem (PDF filename without extension).
        ptype: page type key ("recueil", "concertation_2", ...).
        extract: full pipeline JSON for the dossier.
        gold: gold-set JSON for the dossier, or None if not annotated yet.

    Saving or resetting writes the whole page's fields back through save_gold().
    """
    pipe_data = (extract.get("extraction") or {}).get(ptype) or {}
    gold_data = (gold or {}).get("extraction", {}).get(ptype) or {} if gold else {}

    # Locate the physical page number for this page type, to load the image.
    pages_meta = extract.get("pages") or []
    page_num = next((p["page"] for p in pages_meta if p.get("type") == ptype), None)
    # NOTE(review): next() without a default raises StopIteration if the PDF
    # disappeared from PDF_DIR between selection and render — confirm acceptable.
    pdf_path = next(p for p in list_pdfs() if p.stem == name)
    if page_num is None:
        st.warning(f"Aucune page trouvée de type '{ptype}' pour {name}")
        return

    col_img, col_fields = st.columns([5, 4])

    with col_img:
        st.caption(f"{name} — page {page_num} ({ptype})")
        # NOTE(review): pdf_to_images is re-run on every Streamlit rerun of
        # this tab; consider caching if rendering feels slow.
        images = pdf_to_images(str(pdf_path))
        if page_num <= len(images):
            st.image(Image.open(images[page_num - 1]), use_container_width=True)

    with col_fields:
        fields_def = PAGE_FIELDS.get(ptype)
        if fields_def:
            st.markdown("**Champs extraits**")
            # Edit form: one text widget per declared field, grouped by section.
            with st.form(f"form_{ptype}_{name}"):
                # Start from the gold values when they exist, else from pipeline.
                edited = deepcopy(gold_data) if gold_data else deepcopy(pipe_data)
                for section, fields in fields_def:
                    st.markdown(f"*{section}*")
                    for f in fields:
                        pipe_val = get_field(pipe_data, f)
                        gold_val = get_field(gold_data, f)
                        cur_val = get_field(edited, f) or pipe_val
                        if not gold_data:
                            cur_val = pipe_val
                        # Label = gold-agreement marker + ATIH badge + field path.
                        cmp = compare_value(pipe_val, gold_val)
                        emoji, help_txt = validation_marker(pipe_data, f)
                        label = f"{cmp}{emoji} `{f}`"
                        # "argumentaire" is free text -> textarea; everything else is one line.
                        if f == "argumentaire":
                            new = st.text_area(label, value=cur_val, height=220,
                                               key=f"fld_{name}_{ptype}_{f}",
                                               help=help_txt or f"pipeline : {pipe_val!r}")
                        else:
                            new = st.text_input(label, value=cur_val,
                                                key=f"fld_{name}_{ptype}_{f}",
                                                help=help_txt or f"pipeline : {pipe_val!r}")
                        set_field(edited, f, new.strip())

                col_a, col_b = st.columns(2)
                with col_a:
                    save = st.form_submit_button("💾 Sauver (gold)")
                with col_b:
                    reset = st.form_submit_button("📋 Reset depuis pipeline")

            if save:
                # Persist the edited fields for this page into the gold file.
                g = gold or {"fichier": name, "extraction": {}}
                g.setdefault("extraction", {})[ptype] = edited
                save_gold(name, g)
                st.success(f"Gold {ptype} sauvegardé")
            elif reset:
                # Overwrite the gold page with a fresh copy of the pipeline output.
                g = gold or {"fichier": name, "extraction": {}}
                g.setdefault("extraction", {})[ptype] = deepcopy(pipe_data)
                save_gold(name, g)
                st.info("Gold réinitialisé depuis pipeline")
        else:
            st.info(f"Pas d'édition structurée pour type `{ptype}`. JSON brut seulement.")

    # Raw JSON at the bottom, full width, collapsed by default.
    with st.expander("📄 JSON brut extrait (pipeline)"):
        st.json(pipe_data)
    with st.expander("🥇 JSON gold actuel"):
        st.json(gold_data if gold_data else {})
    # Raw OCR output (useful when the page has a parse_error).
    page_meta = next((p for p in pages_meta if p.get("page") == page_num), None)
    if page_meta and page_meta.get("ocr_raw"):
        with st.expander("📝 OCR raw (texte brut renvoyé par le modèle)"):
            st.code(page_meta.get("ocr_raw", ""), language="json")
|
||||
|
||||
|
||||
def main():
    """Entry point: sidebar dossier picker + one tab per detected page type."""
    st.set_page_config(page_title="OGC Overlay", layout="wide")

    # Shrink Streamlit's default page margins.
    st.markdown("""
        <style>
        .block-container { padding-top: 1rem; padding-bottom: 1rem; max-width: 95%; }
        </style>
    """, unsafe_allow_html=True)

    st.title("🩺 Extraction OGC — review & gold set")

    pdfs = list_pdfs()
    if not pdfs:
        st.error(f"Aucun PDF trouvé dans {PDF_DIR}")
        return

    with st.sidebar:
        st.header("Dossier")
        names = [p.stem for p in pdfs]
        name = st.selectbox("Choisir un OGC", names)

        extract = load_extract(name)
        gold = load_gold(name)

        if extract is None:
            st.error(f"Pas de JSON extrait pour {name}")
            st.caption(f"Attendu : output/v2/{name}.json")
        else:
            st.caption(f"✓ Extraction pipeline chargée")
            if gold:
                st.success("Gold set défini")
            else:
                st.caption("Pas encore de gold")

        # ATIH validation summary for the "recueil" page (counts + GHM/GHS cross-checks).
        if extract:
            rec_v = (extract.get("extraction") or {}).get("recueil", {}).get("_validation", {})
            summary = rec_v.get("summary", {})
            if summary:
                st.markdown("---")
                st.markdown("**Validation ATIH (page recueil)**")
                st.metric("Codes valides", f"{summary.get('valid',0)}/{summary.get('total_codes','?')}")
                cc = rec_v.get("cross_checks", {})
                for side in ("etab", "reco"):
                    c = cc.get(side, {})
                    if c.get("checked"):
                        icon = "✓" if c.get("coherent") else "✗"
                        st.caption(f"{icon} GHM↔GHS {side}")

        st.markdown("---")
        st.caption("💡 Chaque code médical est suivi d'un marqueur :")
        st.caption("🟢 valide ATIH — 🟡 invalide (suggestion dispo) — 🔴 invalide")
        st.caption("✓/✗/∅ = accord avec gold set")

    if extract is None:
        return

    # One tab per page type actually present in the dossier, first occurrence wins.
    pages_meta = extract.get("pages") or []
    available_types = []
    seen = set()
    for p in pages_meta:
        t = p.get("type")
        if t and t not in seen:
            seen.add(t)
            available_types.append(t)
    # Canonical order first, then any unknown types in discovery order.
    ordered = [t for t in PAGE_ORDER if t in available_types] + \
              [t for t in available_types if t not in PAGE_ORDER]

    if not ordered:
        st.warning("Aucune page classifiée dans ce dossier.")
        return

    tabs = st.tabs([PAGE_LABEL.get(t, t) for t in ordered])
    for tab, ptype in zip(tabs, ordered):
        with tab:
            render_page_editor(name, ptype, extract, gold)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user