"""Interface Streamlit de review / annotation des extractions OGC (V2). Usages : 1. **Visualisation** : image + champs structurés + JSON brut côte-à-côte, pour chaque page du dossier (recueil, concertations, preuves…). 2. **Correction & gold set** : éditer chaque champ, sauvegarder dans gold/.json. 3. **Badges de validation ATIH** : chaque code médical est marqué ✓/✗ avec la suggestion de correction s'il existe une correction Levenshtein ≤ 1. Lancement (depuis la racine du projet) : streamlit run pipeline/ui_overlay.py Ou indirectement via run_overlay.sh. """ from __future__ import annotations import json import sys from copy import deepcopy from pathlib import Path # Assurer l'accès à `pipeline.*` quand streamlit lance ce fichier _REPO_ROOT = Path(__file__).resolve().parent.parent if str(_REPO_ROOT) not in sys.path: sys.path.insert(0, str(_REPO_ROOT)) import streamlit as st from PIL import Image # ---------------------------------------------------------------------------- # Compatibility shim : streamlit-drawable-canvas 0.9.3 utilise l'API privée # `streamlit.elements.image.image_to_url` qui a été retirée à partir de # Streamlit ≈ 1.49. On réinjecte une implémentation équivalente fondée sur # un data URI base64, ce qui permet au canvas de continuer à fonctionner # sans downgrader Streamlit globalement. # # Remplacer ce shim par l'upgrade de streamlit-drawable-canvas si une version # > 0.9.3 est publiée. # ---------------------------------------------------------------------------- import base64 as _b64 import io as _io from streamlit.elements import image as _st_image # type: ignore if not hasattr(_st_image, "image_to_url"): def _image_to_url_compat(image, width, clamp, channels, output_format, image_id): """Convertit une PIL.Image en data URI compatible avec drawable-canvas.""" fmt = (output_format or "PNG").upper() if fmt == "JPG": fmt = "JPEG" buf = _io.BytesIO() image.save(buf, format=fmt) b64 = _b64.b64encode(buf.getvalue()).decode("ascii") mime = "image/jpeg" if fmt == "JPEG" else f"image/{fmt.lower()}" return f"data:{mime};base64,{b64}" _st_image.image_to_url = _image_to_url_compat # type: ignore[attr-defined] # ---------------------------------------------------------------------------- from pipeline.ingest import pdf_to_images from pipeline.zones_config import load_config, save_config, DEFAULT_CONFIG_PATH try: from streamlit_drawable_canvas import st_canvas _HAS_CANVAS = True except ImportError: _HAS_CANVAS = False # ============================================================ # Configuration # ============================================================ PDF_DIR = Path("2018 CARC") EXTRACT_DIR = Path("output/v2") GOLD_DIR = Path("gold") GOLD_DIR.mkdir(exist_ok=True) # Ordre des pages selon le type PAGE_ORDER = ["recueil", "concertation_med", "hospitalisation", "preuves", "concertation_2", "concertation_1"] PAGE_LABEL = { "recueil": "p1 — Recueil", "concertation_med": "p2 — Concertation médicale", "hospitalisation": "p3 — Hospitalisation (manuscrit)", "preuves": "p4 — Éléments de preuve", "concertation_2": "p5 — Concertation 2/2 (décision)", "concertation_1": "p6 — Concertation 1/2 (argumentaire)", } # Champs éditables par type de page PAGE_FIELDS = { "recueil": [ ("En-tête", [ "etablissement", "finess", "date_debut_controle", "n_ogc", "n_champ", "dates_sejour", ]), ("Codage Établissement", [ "codage_etab.dp", "codage_etab.dp_libelle", "codage_etab.dr", ]), ("Codage Recodage", [ "codage_reco.dp", "codage_reco.dr", ]), ("GHM / GHS", [ "ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco", ]), ("Décisions", [ "recodage_impactant", "ghs_injustifie", "accord_desaccord", "praticien_conseil", ]), ], "concertation_2": [ ("Décision finale", [ "ghs_initial", "ghs_avant_concertation", "ghs_final", "decision", "date_concertation", ]), ("Signatures", [ "praticien_controleur", "medecin_dim", ]), ], "concertation_1": [ ("Argumentaire", [ "date_concertation", "argumentaire", ]), ], "preuves": [ ("Entête preuves", [ "date", "praticien_controleur", "medecin_dim", ]), ], # concertation_med et hospitalisation : pas d'édition structurée # (pages quasi vides ou manuscrit dense). } # Validité : quels champs d'une page sont validables via referentials ATIH VALIDATION_PATHS = { "recueil": [ ("codage_etab.dp", "cim10"), ("codage_etab.dr", "cim10"), ("codage_reco.dp", "cim10"), ("codage_reco.dr", "cim10"), ("ghm_etab", "ghm"), ("ghs_etab", "ghs"), ("ghm_reco", "ghm"), ("ghs_reco", "ghs"), ], } # ============================================================ # Helpers I/O # ============================================================ def list_pdfs() -> list[Path]: return sorted(PDF_DIR.glob("OGC *.pdf")) def load_extract(name: str) -> dict | None: path = EXTRACT_DIR / f"{name}.json" if not path.exists(): return None return json.loads(path.read_text(encoding="utf-8")) def load_gold(name: str) -> dict | None: path = GOLD_DIR / f"{name}.json" if not path.exists(): return None return json.loads(path.read_text(encoding="utf-8")) def save_gold(name: str, data: dict) -> Path: path = GOLD_DIR / f"{name}.json" path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") return path def get_field(d: dict | None, path: str) -> str: if d is None: return "" for k in path.split("."): d = d.get(k, "") if isinstance(d, dict) else "" return str(d) if d else "" def set_field(d: dict, path: str, value: str) -> None: keys = path.split(".") node = d for k in keys[:-1]: if k not in node or not isinstance(node[k], dict): node[k] = {} node = node[k] node[keys[-1]] = value def compare_value(pipe_val: str, gold_val: str) -> str: if not gold_val and not pipe_val: return "∅" if not gold_val: return "—" if pipe_val.strip() == gold_val.strip(): return "✓" return "✗" def validation_marker(page_data: dict, field_path: str) -> tuple[str, str]: """Retourne (emoji, info) selon la validation ATIH d'un champ.""" v = page_data.get("_validation") or {} # Descendre dans _validation selon le chemin du champ parts = field_path.split(".") node = v for p in parts: node = node.get(p) if isinstance(node, dict) else None if node is None: return "", "" if not isinstance(node, dict) or "valid" not in node: return "", "" if node.get("valid") is True: lib = node.get("libelle_ref", "") return "🟢", (f"ATIH ok — {lib}" if lib else "ATIH ok") if node.get("valid") is False: sug = node.get("suggestion", "") if sug: return "🟡", f"invalide ATIH — suggestion : {sug}" return "🔴", "invalide ATIH" return "", "" # ============================================================ # Vue principale # ============================================================ def render_page_editor(name: str, ptype: str, extract: dict, gold: dict | None): """Affiche l'image + les champs éditables + JSON brut pour une page donnée.""" pipe_data = (extract.get("extraction") or {}).get(ptype) or {} gold_data = (gold or {}).get("extraction", {}).get(ptype) or {} if gold else {} # Trouver l'index de la page pour charger l'image pages_meta = extract.get("pages") or [] page_num = next((p["page"] for p in pages_meta if p.get("type") == ptype), None) pdf_path = next(p for p in list_pdfs() if p.stem == name) if page_num is None: st.warning(f"Aucune page trouvée de type '{ptype}' pour {name}") return col_img, col_fields = st.columns([5, 4]) with col_img: st.caption(f"{name} — page {page_num} ({ptype})") images = pdf_to_images(str(pdf_path)) if page_num <= len(images): st.image(Image.open(images[page_num - 1]), use_container_width=True) with col_fields: fields_def = PAGE_FIELDS.get(ptype) if fields_def: st.markdown("**Champs extraits**") # Formulaire d'édition with st.form(f"form_{ptype}_{name}"): edited = deepcopy(gold_data) if gold_data else deepcopy(pipe_data) for section, fields in fields_def: st.markdown(f"*{section}*") for f in fields: pipe_val = get_field(pipe_data, f) gold_val = get_field(gold_data, f) cur_val = get_field(edited, f) or pipe_val if not gold_data: cur_val = pipe_val cmp = compare_value(pipe_val, gold_val) emoji, help_txt = validation_marker(pipe_data, f) label = f"{cmp}{emoji} `{f}`" # argumentaire : textarea if f == "argumentaire": new = st.text_area(label, value=cur_val, height=220, key=f"fld_{name}_{ptype}_{f}", help=help_txt or f"pipeline : {pipe_val!r}") else: new = st.text_input(label, value=cur_val, key=f"fld_{name}_{ptype}_{f}", help=help_txt or f"pipeline : {pipe_val!r}") set_field(edited, f, new.strip()) col_a, col_b = st.columns(2) with col_a: save = st.form_submit_button("💾 Sauver (gold)") with col_b: reset = st.form_submit_button("📋 Reset depuis pipeline") if save: g = gold or {"fichier": name, "extraction": {}} g.setdefault("extraction", {})[ptype] = edited save_gold(name, g) st.success(f"Gold {ptype} sauvegardé") elif reset: g = gold or {"fichier": name, "extraction": {}} g.setdefault("extraction", {})[ptype] = deepcopy(pipe_data) save_gold(name, g) st.info("Gold réinitialisé depuis pipeline") else: st.info(f"Pas d'édition structurée pour type `{ptype}`. JSON brut seulement.") # JSON brut en bas, pleine largeur, replié par défaut with st.expander("📄 JSON brut extrait (pipeline)"): st.json(pipe_data) with st.expander("🥇 JSON gold actuel"): st.json(gold_data if gold_data else {}) # OCR brut (utile quand parse_error) page_meta = next((p for p in pages_meta if p.get("page") == page_num), None) if page_meta and page_meta.get("ocr_raw"): with st.expander("📝 OCR raw (texte brut renvoyé par le modèle)"): st.code(page_meta.get("ocr_raw", ""), language="json") def render_calibration_page(): """Mode 'Calibration zones' : dessine des rectangles à la souris sur une image de référence, sauvegarde dans pipeline/zones_config.json.""" st.header("🔧 Calibration des zones") if not _HAS_CANVAS: st.error( "Le package `streamlit-drawable-canvas` n'est pas installé.\n" "Installe-le avec : `pip install streamlit-drawable-canvas`" ) return pdfs = list_pdfs() if not pdfs: st.error("Aucun PDF disponible pour la calibration") return col_ctrl, _ = st.columns([1, 3]) with col_ctrl: ref_name = st.selectbox( "PDF de référence (bien cadré)", [p.stem for p in pdfs], key="calib_pdf", ) page_type = st.selectbox( "Type de page", ["recueil"], help="Aujourd'hui seule la page recueil a des zones configurables", ) # Page numéro selon le type (recueil = page 1) page_num = {"recueil": 1}.get(page_type, 1) ref_pdf = next(p for p in pdfs if p.stem == ref_name) img_path = pdf_to_images(str(ref_pdf))[page_num - 1] img = Image.open(img_path) img_w, img_h = img.size # Charger config existante et préparer les zones cfg = load_config() existing_zones = cfg.get(page_type, {}) # On scale l'image pour tenir dans le canvas (largeur ~900 px max) canvas_w = 900 scale = canvas_w / img_w canvas_h = int(img_h * scale) # Préparer les rectangles initiaux depuis la config initial_rects = [] for zone_name, z in existing_zones.items(): if not isinstance(z, dict): continue initial_rects.append({ "type": "rect", "left": z["x1"] * canvas_w, "top": z["y1"] * canvas_h, "width": (z["x2"] - z["x1"]) * canvas_w, "height": (z["y2"] - z["y1"]) * canvas_h, "fill": "rgba(255, 100, 100, 0.15)", "stroke": "red", "strokeWidth": 2, "label_name": zone_name, }) st.caption( "💡 Dessine un rectangle par zone à la souris. Les zones existantes " "apparaissent déjà pré-dessinées. Tu peux les modifier (drag), " "en ajouter, ou en supprimer (touche Suppr) puis cliquer sur " "**Sauvegarder**." ) drawing_mode = st.radio( "Mode", ["rect", "transform"], horizontal=True, format_func=lambda x: {"rect": "✏️ Dessiner", "transform": "🖱 Sélectionner / Déplacer"}[x], key="calib_drawing_mode", ) canvas_result = st_canvas( fill_color="rgba(255, 100, 100, 0.15)", stroke_width=2, stroke_color="red", background_image=img, update_streamlit=True, width=canvas_w, height=canvas_h, drawing_mode=drawing_mode, initial_drawing={"objects": initial_rects, "version": "5.2.1"}, key="calib_canvas", ) # Reconstituer la config à partir des rectangles dessinés rects = (canvas_result.json_data or {}).get("objects", []) if canvas_result.json_data else [] st.markdown("### Zones détectées") if not rects: st.info("Aucun rectangle dessiné.") return new_zones = {} for i, r in enumerate(rects): if r.get("type") != "rect": continue # Récupérer le nom existant si présent, sinon demander default_name = r.get("label_name") or f"zone_{i+1}" name = st.text_input( f"Nom de la zone {i+1}", value=default_name, key=f"calib_name_{i}", ) x1 = r["left"] / canvas_w y1 = r["top"] / canvas_h x2 = x1 + r["width"] / canvas_w y2 = y1 + r["height"] / canvas_h desc = existing_zones.get(name, {}).get("description", "") desc = st.text_input( f"Description (optionnel)", value=desc, key=f"calib_desc_{i}", ) st.caption(f"Coords relatives : ({x1:.3f}, {y1:.3f}) → ({x2:.3f}, {y2:.3f})") new_zones[name] = {"x1": round(x1, 4), "y1": round(y1, 4), "x2": round(x2, 4), "y2": round(y2, 4), "description": desc} if st.button("💾 Sauvegarder la configuration", type="primary"): cfg[page_type] = new_zones path = save_config(cfg) st.success(f"Configuration sauvegardée : {path}") st.json(new_zones) def main(): st.set_page_config(page_title="OGC Overlay", layout="wide") # Réduire les marges par défaut st.markdown(""" """, unsafe_allow_html=True) st.title("🩺 Extraction OGC — review & gold set") # Sélecteur de mode en haut de sidebar with st.sidebar: mode = st.radio("Mode", ["📋 Review dossier", "🔧 Calibration zones"]) if mode == "🔧 Calibration zones": render_calibration_page() return pdfs = list_pdfs() if not pdfs: st.error(f"Aucun PDF trouvé dans {PDF_DIR}") return with st.sidebar: st.header("Dossier") names = [p.stem for p in pdfs] name = st.selectbox("Choisir un OGC", names) extract = load_extract(name) gold = load_gold(name) if extract is None: st.error(f"Pas de JSON extrait pour {name}") st.caption(f"Attendu : output/v2/{name}.json") else: st.caption(f"✓ Extraction pipeline chargée") if gold: st.success("Gold set défini") else: st.caption("Pas encore de gold") # Résumé validation ATIH if extract: rec_v = (extract.get("extraction") or {}).get("recueil", {}).get("_validation", {}) summary = rec_v.get("summary", {}) if summary: st.markdown("---") st.markdown("**Validation ATIH (page recueil)**") st.metric("Codes valides", f"{summary.get('valid',0)}/{summary.get('total_codes','?')}") cc = rec_v.get("cross_checks", {}) for side in ("etab", "reco"): c = cc.get(side, {}) if c.get("checked"): icon = "✓" if c.get("coherent") else "✗" st.caption(f"{icon} GHM↔GHS {side}") st.markdown("---") st.caption("💡 Chaque code médical est suivi d'un marqueur :") st.caption("🟢 valide ATIH — 🟡 invalide (suggestion dispo) — 🔴 invalide") st.caption("✓/✗/∅ = accord avec gold set") if extract is None: return # Onglets par type de page — un par vraie page trouvée pages_meta = extract.get("pages") or [] available_types = [] seen = set() for p in pages_meta: t = p.get("type") if t and t not in seen: seen.add(t) available_types.append(t) # Ordre canonique puis fallback ordered = [t for t in PAGE_ORDER if t in available_types] + \ [t for t in available_types if t not in PAGE_ORDER] if not ordered: st.warning("Aucune page classifiée dans ce dossier.") return tabs = st.tabs([PAGE_LABEL.get(t, t) for t in ordered]) for tab, ptype in zip(tabs, ordered): with tab: render_page_editor(name, ptype, extract, gold) if __name__ == "__main__": main()