feat: mode hybride Ollama — gemma3:27b pour CPAM, 12b pour codage
Le pipeline utilise désormais gemma3:12b (rapide) pour le codage CIM-10 et gemma3:27b (meilleur raisonnement) pour la contre-argumentation CPAM. Configurable via OLLAMA_MODEL_CPAM et OLLAMA_TIMEOUT_CPAM. Inclut aussi : traçabilité source/page DAS, niveaux CMA ATIH, sévérité, page tracker PDF, améliorations fusion et filtres DAS. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -98,11 +98,21 @@ def extract_medical_info(
|
||||
anonymized_text: str,
|
||||
edsnlp_result: Optional[EdsnlpResult] = None,
|
||||
use_rag: bool = False,
|
||||
page_tracker=None,
|
||||
raw_text: str | None = None,
|
||||
) -> DossierMedical:
|
||||
"""Extrait les informations médicales structurées depuis les données parsées et le texte."""
|
||||
"""Extrait les informations médicales structurées depuis les données parsées et le texte.
|
||||
|
||||
Args:
|
||||
page_tracker: PageTracker pour la traçabilité page/extrait (optionnel).
|
||||
raw_text: Texte brut avant anonymisation (pour recherche page source).
|
||||
"""
|
||||
dossier = DossierMedical()
|
||||
dossier.document_type = parsed_data.get("type", "")
|
||||
|
||||
# Texte de référence pour la recherche de pages (raw_text préféré, sinon anonymized)
|
||||
search_text = raw_text or anonymized_text
|
||||
|
||||
_extract_sejour(parsed_data, dossier)
|
||||
_extract_diagnostics(parsed_data, anonymized_text, dossier, edsnlp_result)
|
||||
_extract_actes(anonymized_text, dossier)
|
||||
@@ -140,6 +150,10 @@ def extract_medical_info(
|
||||
# Post-processing : retirer DAS dont le code est identique au DP
|
||||
_remove_das_equal_dp(dossier)
|
||||
|
||||
# Post-processing : traçabilité source (page + extrait)
|
||||
if page_tracker:
|
||||
_apply_source_tracking(dossier, page_tracker, search_text)
|
||||
|
||||
return dossier
|
||||
|
||||
|
||||
@@ -331,10 +345,12 @@ def _extract_diagnostics(
|
||||
elif edsnlp_codes:
|
||||
# Utiliser la première entité CIM-10 edsnlp comme DP
|
||||
code, texte = next(iter(edsnlp_codes.items()))
|
||||
dossier.diagnostic_principal = Diagnostic(
|
||||
texte=texte.capitalize(), cim10_suggestion=code,
|
||||
source="edsnlp",
|
||||
)
|
||||
texte_clean = texte.capitalize()
|
||||
if is_valid_diagnostic_text(texte_clean):
|
||||
dossier.diagnostic_principal = Diagnostic(
|
||||
texte=texte_clean, cim10_suggestion=code,
|
||||
source="edsnlp",
|
||||
)
|
||||
|
||||
# Diagnostics associés depuis le texte (regex)
|
||||
das = _find_diagnostics_associes(text_lower, conclusion, dossier)
|
||||
@@ -881,18 +897,46 @@ def _apply_code_corrections(dossier: DossierMedical) -> None:
|
||||
diag.cim10_suggestion = corrected
|
||||
|
||||
|
||||
def _is_dp_family_redundant(das_code: str, dp_code: str) -> bool:
|
||||
"""True si le DAS est redondant avec le DP (même code, parent/enfant, ou même famille)."""
|
||||
if das_code == dp_code:
|
||||
return True
|
||||
# Relation parent/enfant → toujours redondant
|
||||
das_norm = das_code.replace(".", "")
|
||||
dp_norm = dp_code.replace(".", "")
|
||||
if das_norm.startswith(dp_norm) or dp_norm.startswith(das_norm):
|
||||
return True
|
||||
# Même famille 3 chars, sauf exceptions
|
||||
dp_family = dp_code[:3]
|
||||
if das_code[:3] == dp_family:
|
||||
# S/T (trauma) : sites différents → garder
|
||||
if dp_family[0] in ("S", "T"):
|
||||
return False
|
||||
# E10-E14 (diabète) : complications différentes → garder
|
||||
if dp_family[0] == "E" and dp_family[1:].isdigit() and 10 <= int(dp_family[1:]) <= 14:
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _remove_das_equal_dp(dossier: DossierMedical) -> None:
    """Remove DAS that are redundant with the DP, then dedupe semantic pairs.

    PMSI rule: a DAS must not duplicate the DP. Family/parent-child
    redundancy with the DP is decided by _is_dp_family_redundant; semantic
    redundancy between DAS (e.g. I10 dominated by I11-I13) is delegated to
    apply_semantic_dedup.

    Fix: the semantic dedup between DAS does not depend on the DP, so it now
    runs even when no DP code is available (previously skipped by an early
    return).

    Args:
        dossier: Dossier mutated in place (diagnostics_associes filtered).
    """
    from .das_filter import apply_semantic_dedup

    dp_code = dossier.diagnostic_principal.cim10_suggestion if dossier.diagnostic_principal else None
    if dp_code:
        before = len(dossier.diagnostics_associes)
        dossier.diagnostics_associes = [
            d for d in dossier.diagnostics_associes
            if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code)
        ]
        removed = before - len(dossier.diagnostics_associes)
        if removed:
            logger.info("  DAS≈DP : %d DAS retiré(s) (famille %s du DP)", removed, dp_code[:3])

    # Semantic redundancies between DAS apply regardless of the DP.
    dossier.diagnostics_associes = apply_semantic_dedup(dossier.diagnostics_associes)
|
||||
|
||||
|
||||
def _apply_noncumul_rules(dossier: DossierMedical) -> None:
|
||||
@@ -945,3 +989,33 @@ def _is_abnormal(test: str, value: str) -> bool | None:
|
||||
lo, hi = BIO_NORMALS[test]
|
||||
return val > hi or val < lo
|
||||
return None
|
||||
|
||||
|
||||
def _apply_source_tracking(dossier: DossierMedical, page_tracker, search_text: str) -> None:
    """Attach source traceability (page number + excerpt) to each diagnosis.

    Each diagnosis text is looked up in the source text to recover its
    originating page and a contextual excerpt. Diagnoses that already carry
    a source_page, or that have no text, are left untouched.
    """
    candidates: list[Diagnostic] = (
        [dossier.diagnostic_principal] if dossier.diagnostic_principal else []
    )
    candidates += dossier.diagnostics_associes

    located = 0
    for diagnostic in candidates:
        # Skip entries already tracked or without searchable text.
        if diagnostic.source_page is not None or not diagnostic.texte:
            continue

        found_page = page_tracker.find_page_for_text(diagnostic.texte, search_text)
        if not found_page:
            continue
        diagnostic.source_page = found_page
        diagnostic.source_excerpt = page_tracker.extract_excerpt(diagnostic.texte, search_text)
        located += 1

    if located:
        logger.info("  Traçabilité source : %d/%d diagnostics localisés", located, len(candidates))
|
||||
|
||||
@@ -100,6 +100,44 @@ def is_valid_diagnostic_text(text: str) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
# CIM-10 semantic-redundancy pairs for PMSI coding.
# Format: (dominated_prefix, dominant_prefixes).
# When one DAS code starts with dominated_prefix AND another DAS code starts
# with any of the dominant prefixes, the dominated code is removed.
SEMANTIC_REDUNDANCIES: list[tuple[str, list[str]]] = [
    # I10 (essential hypertension) redundant when I11/I12/I13 present
    # (hypertensive heart / kidney disease).
    ("I10", ["I11", "I12", "I13"]),
    # N30 (cystitis) redundant when N39.0 present (urinary tract infection).
    ("N30", ["N39"]),
    # J18 (pneumonia NOS) redundant when J15/J16 present (specific pneumonia).
    ("J18", ["J15", "J16"]),
]


def apply_semantic_dedup(das_list: list) -> list:
    """Drop DAS made redundant by the presence of a more specific code.

    Dominated/dominant pairs come from SEMANTIC_REDUNDANCIES. Accepts any
    objects exposing a cim10_suggestion attribute (e.g. Diagnostic).
    Returns the input list unchanged when nothing is redundant.
    """
    present = {d.cim10_suggestion for d in das_list if d.cim10_suggestion}
    redundant: set[str] = set()

    for weak_prefix, strong_prefixes in SEMANTIC_REDUNDANCIES:
        weak_codes = {code for code in present if code.startswith(weak_prefix)}
        if not weak_codes:
            continue
        # str.startswith accepts a tuple: any dominant prefix matches.
        strong = tuple(strong_prefixes)
        if any(code.startswith(strong) for code in present):
            redundant |= weak_codes

    if not redundant:
        return das_list
    return [d for d in das_list if d.cim10_suggestion not in redundant]
|
||||
|
||||
|
||||
def correct_known_miscodes(code: str, texte: str) -> str | None:
|
||||
"""Corrige les codes CIM-10 systématiquement mal attribués par le LLM.
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ from ..config import (
|
||||
Sejour,
|
||||
Traitement,
|
||||
)
|
||||
from ..medical.das_filter import is_valid_diagnostic_text, apply_semantic_dedup
|
||||
from ..medical.cim10_extractor import _is_dp_family_redundant
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -163,6 +165,14 @@ def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical:
|
||||
if len(dossiers) == 1:
|
||||
result = dossiers[0].model_copy(deep=True)
|
||||
result.source_files = [result.source_file]
|
||||
# Appliquer la dédup famille DP + sémantique même pour un seul dossier
|
||||
dp_code = result.diagnostic_principal.cim10_suggestion if result.diagnostic_principal else None
|
||||
if dp_code:
|
||||
result.diagnostics_associes = [
|
||||
d for d in result.diagnostics_associes
|
||||
if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code)
|
||||
]
|
||||
result.diagnostics_associes = apply_semantic_dedup(result.diagnostics_associes)
|
||||
return result
|
||||
|
||||
merged = DossierMedical()
|
||||
@@ -181,23 +191,29 @@ def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical:
|
||||
for d in dossiers:
|
||||
all_das.extend(d.diagnostics_associes)
|
||||
# Si le DP de ce dossier est différent du DP fusionné, l'ajouter comme DAS
|
||||
# mais seulement si le texte est un diagnostic valide (filtre artefacts OCR)
|
||||
if (
|
||||
d.diagnostic_principal
|
||||
and merged.diagnostic_principal
|
||||
and d.diagnostic_principal.cim10_suggestion
|
||||
!= merged.diagnostic_principal.cim10_suggestion
|
||||
and is_valid_diagnostic_text(d.diagnostic_principal.texte)
|
||||
):
|
||||
all_das.append(d.diagnostic_principal)
|
||||
|
||||
merged.diagnostics_associes = _dedup_diagnostics(all_das)
|
||||
|
||||
# Retirer les DAS dont le code est identique au DP (violation règle PMSI)
|
||||
# Retirer les DAS redondants avec le DP (même code, famille, parent/enfant)
|
||||
dp_code = merged.diagnostic_principal.cim10_suggestion if merged.diagnostic_principal else None
|
||||
if dp_code:
|
||||
merged.diagnostics_associes = [
|
||||
d for d in merged.diagnostics_associes if d.cim10_suggestion != dp_code
|
||||
d for d in merged.diagnostics_associes
|
||||
if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code)
|
||||
]
|
||||
|
||||
# Redondances sémantiques entre DAS
|
||||
merged.diagnostics_associes = apply_semantic_dedup(merged.diagnostics_associes)
|
||||
|
||||
# Actes CCAM
|
||||
all_actes: list[ActeCCAM] = []
|
||||
for d in dossiers:
|
||||
|
||||
@@ -141,19 +141,29 @@ def _detect_type_ghm(actes_ccam: list) -> str:
|
||||
def _compute_severity(das_list: list) -> tuple[int, int, int]:
|
||||
"""Calcule le niveau de sévérité à partir des DAS.
|
||||
|
||||
Utilise le max des niveau_cma officiels ATIH quand disponibles,
|
||||
avec fallback sur le comptage CMA/CMS.
|
||||
|
||||
Returns:
|
||||
(niveau, cma_count, cms_count)
|
||||
"""
|
||||
cma_count = 0
|
||||
cms_count = 0
|
||||
max_cma_level = 1
|
||||
|
||||
for das in das_list:
|
||||
niveau_cma = getattr(das, "niveau_cma", None)
|
||||
if niveau_cma and niveau_cma > 1:
|
||||
max_cma_level = max(max_cma_level, niveau_cma)
|
||||
if getattr(das, "est_cma", False):
|
||||
cma_count += 1
|
||||
if getattr(das, "est_cms", False):
|
||||
cms_count += 1
|
||||
|
||||
if cms_count >= 2:
|
||||
# Priorité au niveau CMA officiel ATIH
|
||||
if max_cma_level > 1:
|
||||
niveau = max_cma_level
|
||||
elif cms_count >= 2:
|
||||
niveau = 4
|
||||
elif cms_count >= 1 or cma_count >= 3:
|
||||
niveau = 3
|
||||
|
||||
@@ -34,12 +34,12 @@ def _get_anthropic_client():
|
||||
return None
|
||||
|
||||
|
||||
def _call_anthropic(
|
||||
def call_anthropic(
|
||||
prompt: str,
|
||||
temperature: float = 0.1,
|
||||
max_tokens: int = 2500,
|
||||
) -> dict | None:
|
||||
"""Appelle l'API Anthropic en fallback."""
|
||||
"""Appelle l'API Anthropic (Haiku)."""
|
||||
client = _get_anthropic_client()
|
||||
if client is None:
|
||||
return None
|
||||
@@ -82,6 +82,8 @@ def call_ollama(
|
||||
prompt: str,
|
||||
temperature: float = 0.1,
|
||||
max_tokens: int = 2500,
|
||||
model: str | None = None,
|
||||
timeout: int | None = None,
|
||||
) -> dict | None:
|
||||
"""Appelle Ollama en mode JSON natif, avec fallback Anthropic si indisponible.
|
||||
|
||||
@@ -89,16 +91,20 @@ def call_ollama(
|
||||
prompt: Le prompt à envoyer.
|
||||
temperature: Température de génération (défaut: 0.1).
|
||||
max_tokens: Nombre max de tokens (défaut: 2500).
|
||||
model: Modèle Ollama à utiliser (défaut: OLLAMA_MODEL global).
|
||||
timeout: Timeout en secondes (défaut: OLLAMA_TIMEOUT global).
|
||||
|
||||
Returns:
|
||||
Le dict JSON parsé, ou None en cas d'erreur.
|
||||
"""
|
||||
use_model = model or OLLAMA_MODEL
|
||||
use_timeout = timeout or OLLAMA_TIMEOUT
|
||||
for attempt in range(2):
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{OLLAMA_URL}/api/generate",
|
||||
json={
|
||||
"model": OLLAMA_MODEL,
|
||||
"model": use_model,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"format": "json",
|
||||
@@ -107,7 +113,7 @@ def call_ollama(
|
||||
"num_predict": max_tokens,
|
||||
},
|
||||
},
|
||||
timeout=OLLAMA_TIMEOUT,
|
||||
timeout=use_timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
raw = response.json().get("response", "")
|
||||
@@ -115,13 +121,14 @@ def call_ollama(
|
||||
if result is not None:
|
||||
return result
|
||||
if attempt == 0:
|
||||
logger.info("Ollama : retry après échec de parsing")
|
||||
logger.info("Ollama (%s) : retry après échec de parsing", use_model)
|
||||
except requests.ConnectionError:
|
||||
logger.info("Ollama indisponible → fallback Anthropic (%s)", _ANTHROPIC_MODEL)
|
||||
return _call_anthropic(prompt, temperature, max_tokens)
|
||||
return call_anthropic(prompt, temperature, max_tokens)
|
||||
except requests.Timeout:
|
||||
logger.warning("Ollama timeout après %ds → fallback Anthropic", OLLAMA_TIMEOUT)
|
||||
return _call_anthropic(prompt, temperature, max_tokens)
|
||||
logger.warning("Ollama (%s) timeout après %ds → fallback Anthropic",
|
||||
use_model, use_timeout)
|
||||
return call_anthropic(prompt, temperature, max_tokens)
|
||||
except (requests.RequestException, json.JSONDecodeError) as e:
|
||||
logger.warning("Ollama erreur : %s", e)
|
||||
return None
|
||||
|
||||
@@ -6,12 +6,16 @@ Phase 2 (future) : tables CMA/CMS officielles ATIH.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from .cim10_dict import load_dict, normalize_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# --- Marqueurs de sévérité dans le texte ---
|
||||
|
||||
@@ -73,11 +77,34 @@ _HEURISTIC_CMA_ROOTS: set[str] = {
|
||||
}
|
||||
|
||||
|
||||
# Lazy cache for the official ATIH CMA levels (CIM-10 code -> level).
_cma_levels: dict[str, int] | None = None


def _load_cma_levels() -> dict[str, int]:
    """Load the official ATIH CMA levels from data/cma_levels.json.

    The mapping is read once and memoized in the module-level _cma_levels
    cache. A missing or unreadable file yields an empty mapping (logged as
    a warning) so severity evaluation degrades gracefully.
    """
    global _cma_levels
    if _cma_levels is not None:
        return _cma_levels

    from ..config import CMA_LEVELS_PATH

    levels: dict[str, int] = {}
    try:
        parsed = json.loads(CMA_LEVELS_PATH.read_text(encoding="utf-8"))
        levels = {code: int(level) for code, level in parsed.items()}
        logger.debug("CMA levels chargés : %d codes", len(levels))
    except FileNotFoundError:
        logger.warning("Fichier CMA levels non trouvé : %s", CMA_LEVELS_PATH)
    except Exception:
        logger.warning("Erreur chargement CMA levels", exc_info=True)
        levels = {}
    _cma_levels = levels
    return _cma_levels
|
||||
|
||||
|
||||
@dataclass
class SeverityInfo:
    """Result of evaluating the severity of a single diagnosis."""
    # True when the diagnosis likely qualifies as a CMA (associated
    # complication/comorbidity) — from the official ATIH lookup or heuristic.
    est_cma_probable: bool = False
    # Textual severity level: "leger" | "modere" | "severe" | "non_evalue".
    niveau_severite: str = "non_evalue"
    # Official ATIH CMA level: 1 (not a CMA), 2, 3 or 4.
    niveau_cma: int = 1
    # Severity markers found in the diagnosis text (empty when none).
    marqueurs_trouves: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@@ -119,11 +146,14 @@ def _is_heuristic_cma(code: str) -> bool:
|
||||
def evaluate_severity(diagnostic) -> SeverityInfo:
|
||||
"""Évalue la sévérité d'un diagnostic (texte + code CIM-10).
|
||||
|
||||
Utilise en priorité les niveaux CMA officiels ATIH (2/3/4),
|
||||
avec fallback sur l'heuristique par racines CIM-10.
|
||||
|
||||
Args:
|
||||
diagnostic: Objet avec attributs texte, cim10_suggestion.
|
||||
|
||||
Returns:
|
||||
SeverityInfo avec est_cma_probable, niveau_severite, marqueurs_trouves.
|
||||
SeverityInfo avec est_cma_probable, niveau_cma, niveau_severite, marqueurs_trouves.
|
||||
"""
|
||||
info = SeverityInfo()
|
||||
|
||||
@@ -147,13 +177,17 @@ def evaluate_severity(diagnostic) -> SeverityInfo:
|
||||
info.niveau_severite = niveau
|
||||
info.marqueurs_trouves = marqueurs
|
||||
|
||||
# 3. Heuristique CMA basée sur la racine CIM-10
|
||||
if code and _is_heuristic_cma(code):
|
||||
info.est_cma_probable = True
|
||||
|
||||
# Un diagnostic sévère avec un code CMA-probable = forte indication
|
||||
if niveau == "severe" and info.est_cma_probable:
|
||||
info.est_cma_probable = True
|
||||
# 3. Lookup officiel CMA ATIH (prioritaire)
|
||||
if code:
|
||||
cma_levels = _load_cma_levels()
|
||||
official_level = cma_levels.get(code)
|
||||
if official_level:
|
||||
info.niveau_cma = official_level
|
||||
info.est_cma_probable = True
|
||||
elif _is_heuristic_cma(code):
|
||||
# Fallback heuristique → niveau 2
|
||||
info.niveau_cma = 2
|
||||
info.est_cma_probable = True
|
||||
|
||||
return info
|
||||
|
||||
@@ -176,6 +210,7 @@ def enrich_dossier_severity(dp, das_list: list) -> tuple[list[str], int, int]:
|
||||
if dp and dp.cim10_suggestion:
|
||||
info = evaluate_severity(dp)
|
||||
dp.niveau_severite = info.niveau_severite
|
||||
dp.niveau_cma = info.niveau_cma
|
||||
if info.est_cma_probable:
|
||||
dp.est_cma = True
|
||||
|
||||
@@ -187,15 +222,16 @@ def enrich_dossier_severity(dp, das_list: list) -> tuple[list[str], int, int]:
|
||||
continue
|
||||
info = evaluate_severity(das)
|
||||
das.niveau_severite = info.niveau_severite
|
||||
das.niveau_cma = info.niveau_cma
|
||||
if info.est_cma_probable:
|
||||
das.est_cma = True
|
||||
cma_count += 1
|
||||
# CMS = CMA sévère
|
||||
if info.niveau_severite == "severe":
|
||||
# CMS = CMA niveau 4 ou CMA sévère
|
||||
if info.niveau_cma >= 4 or info.niveau_severite == "severe":
|
||||
das.est_cms = True
|
||||
cms_count += 1
|
||||
alertes.append(
|
||||
f"CMA probable : '{das.texte}' ({das.cim10_suggestion}) — "
|
||||
f"CMA niveau {info.niveau_cma} : '{das.texte}' ({das.cim10_suggestion}) — "
|
||||
f"sévérité {info.niveau_severite}"
|
||||
+ (f", marqueurs : {', '.join(info.marqueurs_trouves)}" if info.marqueurs_trouves else "")
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user