Files
t2a_v2/src/config.py
dom 8c75941e40 feat: 3 quick wins — source DAS, fallback code parent, filtre anatomique
1. Champ source sur Diagnostic : trackare/edsnlp/regex/llm_das
   - Renseigné dans les 8 constructeurs de cim10_extractor.py
   - Permet l'audit de provenance des DAS dans le JSON de sortie

2. Fallback code parent pour les codes LLM halluccinés :
   - fallback_parent_code() dans cim10_dict.py (D71.9→D71, R69.8→R69)
   - Intégré dans _apply_llm_result_diagnostic() de rag_search.py
   - Récupère les codes rejetés dont le parent 3-char est valide

3. Règle 12 filtre DAS : en-têtes anatomiques + catégories vagues
   - Rejette "Musculaire", "Digestif", "Hépatique" (mots isolés)
   - Rejette "Musculaire - masse musculaire" (catégorie + description)
   - 13 nouveaux tests unitaires au total

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 11:34:32 +01:00

190 lines
6.1 KiB
Python

"""Configuration globale et modèles de données pour le pipeline T2A."""
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
from dotenv import load_dotenv
from pydantic import BaseModel, Field
load_dotenv()
# --- Chemins ---
BASE_DIR = Path(__file__).resolve().parent.parent
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
ANONYMIZED_DIR = OUTPUT_DIR / "anonymized"
STRUCTURED_DIR = OUTPUT_DIR / "structured"
REPORTS_DIR = OUTPUT_DIR / "reports"
for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
d.mkdir(parents=True, exist_ok=True)
# --- Configuration anonymisation ---
KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes")
NER_MODEL = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner")
NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80"))
# --- Configuration Ollama ---
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b")
OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120"))
OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json"
OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2"))
# --- Configuration RUM / établissement ---
FINESS = os.environ.get("T2A_FINESS", "000000000")
NUM_UM = os.environ.get("T2A_NUM_UM", "0000")
# --- Configuration RAG ---
RAG_INDEX_DIR = BASE_DIR / "data" / "rag_index"
REFERENTIELS_DIR = BASE_DIR / "data" / "referentiels"
UPLOAD_MAX_SIZE_MB = 50
ALLOWED_EXTENSIONS = {".pdf", ".csv", ".xlsx", ".xls", ".txt"}
CIM10_DICT_PATH = BASE_DIR / "data" / "cim10_dict.json"
CIM10_SUPPLEMENTS_PATH = BASE_DIR / "data" / "cim10_supplements.json"
CCAM_DICT_PATH = BASE_DIR / "data" / "ccam_dict.json"
CIM10_PDF = Path(os.environ.get("T2A_CIM10_PDF", "/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf"))
GUIDE_METHODO_PDF = Path(os.environ.get("T2A_GUIDE_METHODO_PDF", "/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf"))
CCAM_PDF = Path(os.environ.get("T2A_CCAM_PDF", "/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf"))
# --- Modèle d'embedding ---
EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-camembert-large")
# --- Modèle de re-ranking (cross-encoder, CPU uniquement) ---
RERANKER_MODEL = os.environ.get("T2A_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
# --- Modèles de données CIM-10 ---
class RAGSource(BaseModel):
document: str
page: Optional[int] = None
code: Optional[str] = None
extrait: Optional[str] = None
class Sejour(BaseModel):
sexe: Optional[str] = None
age: Optional[int] = None
date_entree: Optional[str] = None
date_sortie: Optional[str] = None
duree_sejour: Optional[int] = None
mode_entree: Optional[str] = None
mode_sortie: Optional[str] = None
imc: Optional[float] = None
poids: Optional[float] = None
taille: Optional[float] = None
class Diagnostic(BaseModel):
texte: str
cim10_suggestion: Optional[str] = None
cim10_confidence: Optional[str] = None
justification: Optional[str] = None
raisonnement: Optional[str] = None
sources_rag: list[RAGSource] = Field(default_factory=list)
est_cma: Optional[bool] = None
est_cms: Optional[bool] = None
niveau_severite: Optional[str] = None # "leger" | "modere" | "severe" | "non_evalue"
source: Optional[str] = None # "trackare" | "edsnlp" | "regex" | "llm_das"
class ActeCCAM(BaseModel):
texte: str
code_ccam_suggestion: Optional[str] = None
ccam_confidence: Optional[str] = None
justification: Optional[str] = None
raisonnement: Optional[str] = None
sources_rag: list[RAGSource] = Field(default_factory=list)
date: Optional[str] = None
validite: Optional[str] = None # "valide" | "obsolete" | "non_verifie"
alertes: list[str] = Field(default_factory=list)
class Traitement(BaseModel):
medicament: str
posologie: Optional[str] = None
code_atc: Optional[str] = None
class BiologieCle(BaseModel):
test: str
valeur: Optional[str] = None
anomalie: Optional[bool] = None
class Imagerie(BaseModel):
type: str
conclusion: Optional[str] = None
score: Optional[str] = None
class DossierMedical(BaseModel):
source_file: str = ""
document_type: str = ""
sejour: Sejour = Field(default_factory=Sejour)
diagnostic_principal: Optional[Diagnostic] = None
diagnostics_associes: list[Diagnostic] = Field(default_factory=list)
actes_ccam: list[ActeCCAM] = Field(default_factory=list)
antecedents: list[str] = Field(default_factory=list)
traitements_sortie: list[Traitement] = Field(default_factory=list)
biologie_cle: list[BiologieCle] = Field(default_factory=list)
imagerie: list[Imagerie] = Field(default_factory=list)
complications: list[str] = Field(default_factory=list)
alertes_codage: list[str] = Field(default_factory=list)
source_files: list[str] = Field(default_factory=list)
ghm_estimation: Optional[GHMEstimation] = None
controles_cpam: list[ControleCPAM] = Field(default_factory=list)
processing_time_s: float | None = None
# --- Rapport d'anonymisation ---
class GHMEstimation(BaseModel):
cmd: Optional[str] = None
cmd_libelle: Optional[str] = None
type_ghm: Optional[str] = None # "C" / "M" / "K"
severite: int = 1 # 1-4
ghm_approx: Optional[str] = None # ex: "07C??2"
cma_count: int = 0
cms_count: int = 0
alertes: list[str] = Field(default_factory=list)
class ControleCPAM(BaseModel):
numero_ogc: int
titre: str = ""
arg_ucr: str = ""
decision_ucr: str = ""
dp_ucr: Optional[str] = None
da_ucr: Optional[str] = None
dr_ucr: Optional[str] = None
actes_ucr: Optional[str] = None
contre_argumentation: Optional[str] = None
sources_reponse: list[RAGSource] = Field(default_factory=list)
class AnonymizationReport(BaseModel):
source_file: str
total_replacements: int = 0
regex_replacements: int = 0
ner_replacements: int = 0
sweep_replacements: int = 0
entities_found: list[dict] = Field(default_factory=list)