"""Configuration globale et modèles de données pour le pipeline T2A.""" from __future__ import annotations import os from pathlib import Path from typing import Optional from dotenv import load_dotenv from pydantic import BaseModel, Field load_dotenv() # --- Chemins --- BASE_DIR = Path(__file__).resolve().parent.parent INPUT_DIR = BASE_DIR / "input" OUTPUT_DIR = BASE_DIR / "output" ANONYMIZED_DIR = OUTPUT_DIR / "anonymized" STRUCTURED_DIR = OUTPUT_DIR / "structured" REPORTS_DIR = OUTPUT_DIR / "reports" for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR): d.mkdir(parents=True, exist_ok=True) # --- Configuration anonymisation --- KEEP_ESTABLISHMENT_NAME = os.environ.get("T2A_KEEP_ESTABLISHMENT", "True").lower() in ("true", "1", "yes") NER_MODEL = os.environ.get("T2A_NER_MODEL", "Jean-Baptiste/camembert-ner") NER_CONFIDENCE_THRESHOLD = float(os.environ.get("T2A_NER_THRESHOLD", "0.80")) # --- Configuration Ollama --- OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:12b") OLLAMA_TIMEOUT = int(os.environ.get("OLLAMA_TIMEOUT", "120")) OLLAMA_CACHE_PATH = BASE_DIR / "data" / "ollama_cache.json" OLLAMA_MAX_PARALLEL = int(os.environ.get("OLLAMA_MAX_PARALLEL", "2")) # --- Configuration RUM / établissement --- FINESS = os.environ.get("T2A_FINESS", "000000000") NUM_UM = os.environ.get("T2A_NUM_UM", "0000") # --- Configuration RAG --- RAG_INDEX_DIR = BASE_DIR / "data" / "rag_index" REFERENTIELS_DIR = BASE_DIR / "data" / "referentiels" UPLOAD_MAX_SIZE_MB = 50 ALLOWED_EXTENSIONS = {".pdf", ".csv", ".xlsx", ".xls", ".txt"} CIM10_DICT_PATH = BASE_DIR / "data" / "cim10_dict.json" CIM10_SUPPLEMENTS_PATH = BASE_DIR / "data" / "cim10_supplements.json" CCAM_DICT_PATH = BASE_DIR / "data" / "ccam_dict.json" CIM10_PDF = Path(os.environ.get("T2A_CIM10_PDF", "/home/dom/ai/aivanov_CIM/cim-10-fr_2026_a_usage_pmsi_version_provisoire_111225.pdf")) GUIDE_METHODO_PDF = Path(os.environ.get("T2A_GUIDE_METHODO_PDF", "/home/dom/ai/aivanov_CIM/guide_methodo_mco_2026_version_provisoire.pdf")) CCAM_PDF = Path(os.environ.get("T2A_CCAM_PDF", "/home/dom/ai/aivanov_CIM/actualisation_ccam_descriptive_a_usage_pmsi_v4_2025.pdf")) # --- Modèle d'embedding --- EMBEDDING_MODEL = os.environ.get("T2A_EMBEDDING_MODEL", "dangvantuan/sentence-camembert-large") # --- Modèle de re-ranking (cross-encoder, CPU uniquement) --- RERANKER_MODEL = os.environ.get("T2A_RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2") # --- Modèles de données CIM-10 --- class RAGSource(BaseModel): document: str page: Optional[int] = None code: Optional[str] = None extrait: Optional[str] = None class Sejour(BaseModel): sexe: Optional[str] = None age: Optional[int] = None date_entree: Optional[str] = None date_sortie: Optional[str] = None duree_sejour: Optional[int] = None mode_entree: Optional[str] = None mode_sortie: Optional[str] = None imc: Optional[float] = None poids: Optional[float] = None taille: Optional[float] = None class Diagnostic(BaseModel): texte: str cim10_suggestion: Optional[str] = None cim10_confidence: Optional[str] = None justification: Optional[str] = None raisonnement: Optional[str] = None sources_rag: list[RAGSource] = Field(default_factory=list) est_cma: Optional[bool] = None est_cms: Optional[bool] = None niveau_severite: Optional[str] = None # "leger" | "modere" | "severe" | "non_evalue" source: Optional[str] = None # "trackare" | "edsnlp" | "regex" | "llm_das" class ActeCCAM(BaseModel): texte: str code_ccam_suggestion: Optional[str] = None ccam_confidence: Optional[str] = None justification: Optional[str] = None raisonnement: Optional[str] = None sources_rag: list[RAGSource] = Field(default_factory=list) date: Optional[str] = None validite: Optional[str] = None # "valide" | "obsolete" | "non_verifie" alertes: list[str] = Field(default_factory=list) class Traitement(BaseModel): medicament: str posologie: Optional[str] = None code_atc: Optional[str] = None class BiologieCle(BaseModel): test: str valeur: Optional[str] = None anomalie: Optional[bool] = None class Imagerie(BaseModel): type: str conclusion: Optional[str] = None score: Optional[str] = None class DossierMedical(BaseModel): source_file: str = "" document_type: str = "" sejour: Sejour = Field(default_factory=Sejour) diagnostic_principal: Optional[Diagnostic] = None diagnostics_associes: list[Diagnostic] = Field(default_factory=list) actes_ccam: list[ActeCCAM] = Field(default_factory=list) antecedents: list[str] = Field(default_factory=list) traitements_sortie: list[Traitement] = Field(default_factory=list) biologie_cle: list[BiologieCle] = Field(default_factory=list) imagerie: list[Imagerie] = Field(default_factory=list) complications: list[str] = Field(default_factory=list) alertes_codage: list[str] = Field(default_factory=list) source_files: list[str] = Field(default_factory=list) ghm_estimation: Optional[GHMEstimation] = None controles_cpam: list[ControleCPAM] = Field(default_factory=list) processing_time_s: float | None = None # --- Rapport d'anonymisation --- class GHMEstimation(BaseModel): cmd: Optional[str] = None cmd_libelle: Optional[str] = None type_ghm: Optional[str] = None # "C" / "M" / "K" severite: int = 1 # 1-4 ghm_approx: Optional[str] = None # ex: "07C??2" cma_count: int = 0 cms_count: int = 0 alertes: list[str] = Field(default_factory=list) class ControleCPAM(BaseModel): numero_ogc: int titre: str = "" arg_ucr: str = "" decision_ucr: str = "" dp_ucr: Optional[str] = None da_ucr: Optional[str] = None dr_ucr: Optional[str] = None actes_ucr: Optional[str] = None contre_argumentation: Optional[str] = None sources_reponse: list[RAGSource] = Field(default_factory=list) class AnonymizationReport(BaseModel): source_file: str total_replacements: int = 0 regex_replacements: int = 0 ner_replacements: int = 0 sweep_replacements: int = 0 entities_found: list[dict] = Field(default_factory=list)