From 40934fdc39101acb6137074816b3ba9daca5d124 Mon Sep 17 00:00:00 2001 From: dom Date: Wed, 18 Feb 2026 20:59:50 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20tra=C3=A7abilit=C3=A9=20source=20syst?= =?UTF-8?q?=C3=A9matique=20+=20viewer=20interactif?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ajoute source_page/source_excerpt à tous les types (biologie, imagerie, traitements, actes CCAM, antécédents, complications). Convertit antecedents et complications en types structurés (Antecedent/Complication) avec validators backward-compat pour les vieux JSON. Étend _apply_source_tracking à tous les éléments du dossier. Ajoute un endpoint /api/source-text/ et un modal interactif dans le viewer avec surlignage du texte source. Co-Authored-By: Claude Opus 4.6 --- src/config.py | 54 ++++++++++- src/control/cpam_response.py | 4 +- src/medical/cim10_extractor.py | 103 ++++++++++++++++----- src/medical/clinical_context.py | 4 +- src/medical/fusion.py | 6 +- src/viewer/app.py | 23 ++++- src/viewer/templates/base.html | 73 +++++++++++++++ src/viewer/templates/detail.html | 119 +++++++++++++++++++++--- tests/test_medical.py | 149 ++++++++++++++++++++++++++++++- tests/test_viewer.py | 12 +++ 10 files changed, 500 insertions(+), 47 deletions(-) diff --git a/src/config.py b/src/config.py index 4ada52c..964afe0 100644 --- a/src/config.py +++ b/src/config.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Optional from dotenv import load_dotenv -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator load_dotenv() @@ -125,24 +125,44 @@ class ActeCCAM(BaseModel): date: Optional[str] = None validite: Optional[str] = None # "valide" | "obsolete" | "non_verifie" alertes: list[str] = Field(default_factory=list) + source_page: Optional[int] = None + source_excerpt: Optional[str] = None class Traitement(BaseModel): medicament: str posologie: Optional[str] = None code_atc: Optional[str] = None + source_page: Optional[int] = None + source_excerpt: Optional[str] = None class BiologieCle(BaseModel): test: str valeur: Optional[str] = None anomalie: Optional[bool] = None + source_page: Optional[int] = None + source_excerpt: Optional[str] = None class Imagerie(BaseModel): type: str conclusion: Optional[str] = None score: Optional[str] = None + source_page: Optional[int] = None + source_excerpt: Optional[str] = None + + +class Antecedent(BaseModel): + texte: str + source_page: Optional[int] = None + source_excerpt: Optional[str] = None + + +class Complication(BaseModel): + texte: str + source_page: Optional[int] = None + source_excerpt: Optional[str] = None class DossierMedical(BaseModel): @@ -152,17 +172,45 @@ class DossierMedical(BaseModel): diagnostic_principal: Optional[Diagnostic] = None diagnostics_associes: list[Diagnostic] = Field(default_factory=list) actes_ccam: list[ActeCCAM] = Field(default_factory=list) - antecedents: list[str] = Field(default_factory=list) + antecedents: list[Antecedent] = Field(default_factory=list) traitements_sortie: list[Traitement] = Field(default_factory=list) biologie_cle: list[BiologieCle] = Field(default_factory=list) imagerie: list[Imagerie] = Field(default_factory=list) - complications: list[str] = Field(default_factory=list) + complications: list[Complication] = Field(default_factory=list) alertes_codage: list[str] = Field(default_factory=list) source_files: list[str] = Field(default_factory=list) ghm_estimation: Optional[GHMEstimation] = None controles_cpam: list[ControleCPAM] = Field(default_factory=list) processing_time_s: float | None = None + @field_validator("antecedents", mode="before") + @classmethod + def _coerce_antecedents(cls, v): + """Backward compat : convertit les anciennes list[str] en list[Antecedent].""" + if not isinstance(v, list): + return v + result = [] + for item in v: + if isinstance(item, str): + result.append({"texte": item}) + else: + result.append(item) + return result + + @field_validator("complications", mode="before") + @classmethod + def _coerce_complications(cls, v): + """Backward compat : convertit les anciennes list[str] en list[Complication].""" + if not isinstance(v, list): + return v + result = [] + for item in v: + if isinstance(item, str): + result.append({"texte": item}) + else: + result.append(item) + return result + # --- Rapport d'anonymisation --- diff --git a/src/control/cpam_response.py b/src/control/cpam_response.py index 049faee..3b47ce2 100644 --- a/src/control/cpam_response.py +++ b/src/control/cpam_response.py @@ -470,10 +470,10 @@ def _build_cpam_prompt( dossier_lines.append(f"- Traitements de sortie : {', '.join(trt_parts)}") if dossier.antecedents: - dossier_lines.append(f"- Antécédents : {', '.join(dossier.antecedents[:10])}") + dossier_lines.append(f"- Antécédents : {', '.join(a.texte for a in dossier.antecedents[:10])}") if dossier.complications: - dossier_lines.append(f"- Complications : {', '.join(dossier.complications)}") + dossier_lines.append(f"- Complications : {', '.join(c.texte for c in dossier.complications)}") dossier_str = "\n".join(dossier_lines) if dossier_lines else "Non disponible" diff --git a/src/medical/cim10_extractor.py b/src/medical/cim10_extractor.py index 2da00fe..1c2136e 100644 --- a/src/medical/cim10_extractor.py +++ b/src/medical/cim10_extractor.py @@ -14,7 +14,9 @@ from .ccam_dict import lookup as ccam_lookup, validate_code as ccam_validate from .das_filter import clean_diagnostic_text, is_valid_diagnostic_text, correct_known_miscodes from ..config import ( ActeCCAM, + Antecedent, BiologieCle, + Complication, Diagnostic, DossierMedical, Imagerie, @@ -180,10 +182,10 @@ def _extract_das_llm(text: str, dossier: DossierMedical) -> None: "age": dossier.sejour.age, "duree_sejour": dossier.sejour.duree_sejour, "imc": dossier.sejour.imc, - "antecedents": dossier.antecedents[:5], + "antecedents": [a.texte for a in dossier.antecedents[:5]], "biologie_cle": [(b.test, b.valeur, b.anomalie) for b in dossier.biologie_cle], "imagerie": [(i.type, (i.conclusion or "")[:200]) for i in dossier.imagerie], - "complications": dossier.complications, + "complications": [c.texte for c in dossier.complications], } # DAS existants (texte + code) @@ -532,7 +534,8 @@ _ANTECEDENT_NOISE = ( "item de", "surveillance", "température", "signes vitaux", "pouls", "type de note", "aucune donnée", "renseignée", "habitudes de vie", "systolique", "diastolique", "saturation", - "texte libre", "mode de vie", "n° rpps", + "texte libre", "mode de vie", "n° rpps", "secrétariat", + "aucune aide", ) _SURVEILLANCE_SINGLE_WORDS = frozenset({ @@ -569,8 +572,14 @@ def _is_valid_antecedent(line: str) -> bool: # Deux mots identiques if len(words) == 2 and len(set(words)) == 1: return False - # Identifiants administratifs isolés - if re.match(r'^\[MEDECIN\]\s', line) and len(line) < 30: + # Lignes commençant par un tag médecin (artefact colonne gauche CRH) + if re.match(r'^\[MEDECIN', line): + return False + # Lignes commençant par "Dr [MEDECIN" ou "Dr[PERSONNE" (nom de médecin) + if re.match(r'^Dr\s*\[', line): + return False + # Fragment de localisation : "de Bordeaux", "de Lyon", "de Paris" + if re.match(r'^de [A-ZÀ-Ú]', line) and len(line) < 25: return False return True @@ -578,7 +587,7 @@ def _is_valid_antecedent(line: str) -> bool: def _extract_antecedents(text: str, dossier: DossierMedical) -> None: """Extrait les antécédents.""" m = re.search( - r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|\n\n))", + r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|Signes\s+[Vv]itaux|Observations?\s+m[eé]dicale|Passage aux|Mode de vie|\n\n))", text, re.DOTALL | re.IGNORECASE, ) @@ -587,7 +596,7 @@ def _extract_antecedents(text: str, dossier: DossierMedical) -> None: for line in block.split("\n"): line = line.strip().lstrip("- •") if _is_valid_antecedent(line): - dossier.antecedents.append(line) + dossier.antecedents.append(Antecedent(texte=line)) def _extract_traitements( @@ -778,7 +787,7 @@ def _extract_complications( # Fallback regex pour la négation pattern = rf"(?:pas de|sans|absence de|aucun[e]?)\s+{re.escape(term)}" if not re.search(pattern, text_lower): - dossier.complications.append(term.capitalize()) + dossier.complications.append(Complication(texte=term.capitalize())) def _is_negated_by_edsnlp(term: str, negated_terms: set[str]) -> bool: @@ -1028,34 +1037,84 @@ def _is_abnormal(test: str, value: str) -> bool | None: return None -def _apply_source_tracking(dossier: DossierMedical, page_tracker, search_text: str) -> None: - """Ajoute la traçabilité source (page + extrait) à chaque diagnostic. +def _track_item(item, search_key: str, page_tracker, search_text: str) -> bool: + """Cherche la page source et l'extrait pour un item avec source_page/source_excerpt.""" + if item.source_page is not None: + return False + if not search_key: + return False + page = page_tracker.find_page_for_text(search_key, search_text) + if page: + item.source_page = page + item.source_excerpt = page_tracker.extract_excerpt(search_key, search_text) + return True + return False - Cherche le texte du diagnostic dans le texte source pour retrouver + +def _apply_source_tracking(dossier: DossierMedical, page_tracker, search_text: str) -> None: + """Ajoute la traçabilité source (page + extrait) à tous les éléments du dossier. + + Cherche le texte de chaque élément dans le texte source pour retrouver la page d'origine et extraire un passage contextualisé. """ + tracked = 0 + total = 0 + + # Diagnostics (DP + DAS) all_diags: list[Diagnostic] = [] if dossier.diagnostic_principal: all_diags.append(dossier.diagnostic_principal) all_diags.extend(dossier.diagnostics_associes) - tracked = 0 for diag in all_diags: - if diag.source_page is not None: - continue # déjà renseigné + total += 1 + if _track_item(diag, diag.texte, page_tracker, search_text): + tracked += 1 - texte = diag.texte - if not texte: - continue + # Biologie + for b in dossier.biologie_cle: + total += 1 + search_key = f"{b.test}: {b.valeur}" if b.valeur else b.test + if _track_item(b, search_key, page_tracker, search_text): + tracked += 1 + elif b.valeur and _track_item(b, b.test, page_tracker, search_text): + tracked += 1 - page = page_tracker.find_page_for_text(texte, search_text) - if page: - diag.source_page = page - diag.source_excerpt = page_tracker.extract_excerpt(texte, search_text) + # Imagerie + for img in dossier.imagerie: + total += 1 + search_key = img.type + if _track_item(img, search_key, page_tracker, search_text): + tracked += 1 + elif img.conclusion and _track_item(img, img.conclusion[:50], page_tracker, search_text): + tracked += 1 + + # Traitements + for t in dossier.traitements_sortie: + total += 1 + if _track_item(t, t.medicament, page_tracker, search_text): + tracked += 1 + + # Actes CCAM + for a in dossier.actes_ccam: + total += 1 + if _track_item(a, a.texte, page_tracker, search_text): + tracked += 1 + + # Antécédents + for ant in dossier.antecedents: + total += 1 + if _track_item(ant, ant.texte, page_tracker, search_text): + tracked += 1 + + # Complications + for comp in dossier.complications: + total += 1 + if _track_item(comp, comp.texte, page_tracker, search_text): tracked += 1 if tracked: - logger.info(" Traçabilité source : %d/%d diagnostics localisés", tracked, len(all_diags)) + logger.info(" Traçabilité source : %d/%d éléments localisés", tracked, total) def _validate_justifications(dossier: DossierMedical) -> None: diff --git a/src/medical/clinical_context.py b/src/medical/clinical_context.py index 5758cfa..5ba8bde 100644 --- a/src/medical/clinical_context.py +++ b/src/medical/clinical_context.py @@ -166,10 +166,10 @@ def build_enriched_context(dossier: DossierMedical) -> dict: "age": dossier.sejour.age, "duree_sejour": dossier.sejour.duree_sejour, "imc": dossier.sejour.imc, - "antecedents": dossier.antecedents[:5], + "antecedents": [a.texte for a in dossier.antecedents[:5]], "biologie_cle": [(b.test, b.valeur, b.anomalie) for b in dossier.biologie_cle], "imagerie": [(i.type, (i.conclusion or "")[:200]) for i in dossier.imagerie], - "complications": dossier.complications, + "complications": [c.texte for c in dossier.complications], } # Interprétations biologiques diff --git a/src/medical/fusion.py b/src/medical/fusion.py index f26a41f..3b727cf 100644 --- a/src/medical/fusion.py +++ b/src/medical/fusion.py @@ -10,7 +10,9 @@ import logging from ..config import ( ActeCCAM, + Antecedent, BiologieCle, + Complication, Diagnostic, DossierMedical, Imagerie, @@ -251,7 +253,7 @@ def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical: ant_seen: set[str] = set() for d in dossiers: for a in d.antecedents: - key = a.lower().strip() + key = a.texte.lower().strip() if key not in ant_seen: merged.antecedents.append(a) ant_seen.add(key) @@ -260,7 +262,7 @@ def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical: comp_seen: set[str] = set() for d in dossiers: for c in d.complications: - key = c.lower().strip() + key = c.texte.lower().strip() if key not in comp_seen: merged.complications.append(c) comp_seen.add(key) diff --git a/src/viewer/app.py b/src/viewer/app.py index dbea3b2..ddfeaab 100644 --- a/src/viewer/app.py +++ b/src/viewer/app.py @@ -16,7 +16,7 @@ from werkzeug.utils import secure_filename from collections import Counter from ..config import ( - STRUCTURED_DIR, OLLAMA_URL, CCAM_DICT_PATH, DossierMedical, + ANONYMIZED_DIR, STRUCTURED_DIR, OLLAMA_URL, CCAM_DICT_PATH, DossierMedical, ALLOWED_EXTENSIONS, UPLOAD_MAX_SIZE_MB, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF, CIM10_DICT_PATH, CIM10_SUPPLEMENTS_PATH, ) @@ -594,6 +594,27 @@ def create_app() -> Flask: logger.exception("Erreur lors du retraitement") return jsonify({"error": str(e)}), 500 + # ------------------------------------------------------------------ + # API texte source anonymisé + # ------------------------------------------------------------------ + + @app.route("/api/source-text/") + def source_text(dossier_id: str): + """Retourne le contenu texte anonymisé de tous les fichiers d'un dossier.""" + safe_dir = (ANONYMIZED_DIR / dossier_id).resolve() + if not safe_dir.is_relative_to(ANONYMIZED_DIR.resolve()): + abort(403) + if not safe_dir.is_dir(): + abort(404) + + result = {} + for txt_path in sorted(safe_dir.glob("*_anonymized.txt")): + try: + result[txt_path.name] = txt_path.read_text(encoding="utf-8") + except Exception: + logger.warning("Impossible de lire %s", txt_path) + return jsonify(result) + # ------------------------------------------------------------------ # Routes admin référentiels # ------------------------------------------------------------------ diff --git a/src/viewer/templates/base.html b/src/viewer/templates/base.html index b3f22fd..fc48ba1 100644 --- a/src/viewer/templates/base.html +++ b/src/viewer/templates/base.html @@ -233,6 +233,79 @@ border-radius: 50%; animation: spin 0.8s linear infinite; } + + /* Source tracking badges */ + .src-btn { + display: inline-block; + padding: 1px 6px; + border-radius: 9999px; + font-size: 0.65rem; + font-weight: 600; + background: #e0f2fe; + color: #0369a1; + border: 1px solid #bae6fd; + cursor: pointer; + margin-left: 0.3rem; + vertical-align: middle; + transition: background 0.15s; + } + .src-btn:hover { background: #bae6fd; } + + /* Source modal */ + #source-modal { + display: none; + position: fixed; + inset: 0; + z-index: 9999; + background: rgba(0,0,0,0.5); + padding: 2rem; + } + #source-modal-inner { + background: #fff; + border-radius: 12px; + max-width: 900px; + margin: 0 auto; + max-height: 90vh; + display: flex; + flex-direction: column; + box-shadow: 0 8px 30px rgba(0,0,0,0.2); + } + #source-header { + padding: 1rem 1.25rem; + border-bottom: 1px solid #e2e8f0; + font-weight: 700; + font-size: 0.9rem; + color: #0f172a; + display: flex; + justify-content: space-between; + align-items: center; + } + #source-content { + flex: 1; + overflow-y: auto; + padding: 1.25rem; + font-size: 0.85rem; + line-height: 1.6; + white-space: pre-wrap; + word-break: break-word; + color: #334155; + } + #source-content mark { + background: #fef08a; + padding: 2px 0; + border-radius: 2px; + } + #source-close-btn { + padding: 0.4rem 1rem; + background: #64748b; + color: #fff; + border: none; + border-radius: 6px; + cursor: pointer; + font-size: 0.8rem; + font-weight: 600; + } + #source-close-btn:hover { background: #475569; } diff --git a/src/viewer/templates/detail.html b/src/viewer/templates/detail.html index 72427b5..b86eb9a 100644 --- a/src/viewer/templates/detail.html +++ b/src/viewer/templates/detail.html @@ -287,7 +287,10 @@ {% set dp = dossier.diagnostic_principal %}

Diagnostic principal

-
{{ dp.texte }}
+
+ {{ dp.texte }} + {% if dp.source_page %}{% endif %} +
{% if dp.cim10_suggestion %} {{ dp.cim10_suggestion }} {{ dp.cim10_confidence | confidence_badge }} @@ -355,12 +358,7 @@ {{ das.source }} {% endif %} {% if das.source_page %} - p.{{ das.source_page }} - {% endif %} - {% if das.source_excerpt %} -
extrait -
{{ das.source_excerpt }}
-
+ {% endif %} @@ -410,7 +408,7 @@

Actes CCAM ({{ dossier.actes_ccam|length }})

- + {% for a in dossier.actes_ccam %} @@ -432,6 +430,7 @@
{{ alerte }}
{% endfor %} + {% endfor %} @@ -444,13 +443,14 @@

Biologie clé ({{ dossier.biologie_cle|length }})

TexteCode CCAMRegroupementDateValidité
TexteCode CCAMRegroupementDateValiditéSource
{% if a.source_page %}{% endif %}
- + {% for b in dossier.biologie_cle %} + {% endfor %} @@ -466,6 +466,7 @@
{{ img.type }} {% if img.score %} — Score : {{ img.score }}{% endif %} + {% if img.source_page %}{% endif %} {% if img.conclusion %}
{{ img.conclusion }}
{% endif %} @@ -479,13 +480,14 @@

Traitements de sortie ({{ dossier.traitements_sortie|length }})

TestValeurAnomalie
TestValeurAnomalieSource
{{ b.test }} {{ b.valeur or '' }} {% if b.anomalie %}Oui{% else %}—{% endif %}{% if b.source_page %}{% endif %}
- + {% for t in dossier.traitements_sortie %} + {% endfor %} @@ -499,7 +501,7 @@

Antécédents ({{ dossier.antecedents|length }})

    {% for a in dossier.antecedents %} -
  • {{ a }}
  • +
  • {{ a.texte }}{% if a.source_page %} {% endif %}
  • {% endfor %}
@@ -511,16 +513,109 @@

Complications ({{ dossier.complications|length }})

    {% for c in dossier.complications %} -
  • {{ c }}
  • +
  • {{ c.texte }}{% if c.source_page %} {% endif %}
  • {% endfor %}
{% endif %} +{# ---- Modal source ---- #} +
+
+
+ Document source + +
+
+
+
+ {% endblock %} {% block scripts %}
MédicamentPosologieCode ATC
MédicamentPosologieCode ATCSource
{{ t.medicament }} {{ t.posologie or '' }} {% if t.code_atc %}{{ t.code_atc }}{% endif %}{% if t.source_page %}{% endif %}