feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
99
src/config.py
Normal file
99
src/config.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""Configuration globale et modèles de données pour le pipeline T2A."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
# --- Chemins ---
|
||||
|
||||
# Root of the project: two directory levels above this file
# (this module lives at src/config.py, so this is the parent of src/).
BASE_DIR = Path(__file__).resolve().parent.parent

# Input location and the output tree, all anchored on BASE_DIR.
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
ANONYMIZED_DIR = OUTPUT_DIR / "anonymized"
STRUCTURED_DIR = OUTPUT_DIR / "structured"
REPORTS_DIR = OUTPUT_DIR / "reports"

# Create the working directories as soon as the module is imported.
# OUTPUT_DIR is not listed explicitly: parents=True creates it along with its
# sub-directories. exist_ok=True makes repeated imports harmless.
# The loop variable is private so it is not exported by `from config import *`.
for _directory in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# --- Configuration anonymisation ---
|
||||
|
||||
# Flag for keeping the establishment name during anonymisation.
# NOTE(review): the consumer of this flag is not visible in this module;
# confirm the exact effect against the anonymiser.
KEEP_ESTABLISHMENT_NAME = True

# Identifier of the NER model.
# NOTE(review): looks like a Hugging Face hub id; confirm with the loading code.
NER_MODEL = "Jean-Baptiste/camembert-ner"

# Confidence threshold associated with the NER step.
# NOTE(review): presumably entities scoring below this value are ignored;
# verify whether the comparison is strict or inclusive.
NER_CONFIDENCE_THRESHOLD = 0.80
|
||||
|
||||
|
||||
# --- Modèles de données CIM-10 ---
|
||||
|
||||
|
||||
class Sejour(BaseModel):
    """Attributes describing one hospital stay ("séjour").

    Every field is optional and defaults to ``None``, so an empty ``Sejour()``
    is valid. Only the declared types are enforced; no value ranges or date
    formats are checked by this model.
    """

    # Patient sex; the set of accepted values is not constrained here.
    sexe: Optional[str] = None
    # Patient age as an integer (unit presumably years; TODO confirm).
    age: Optional[int] = None
    # Entry and exit dates, stored as plain strings (format not enforced).
    date_entree: Optional[str] = None
    date_sortie: Optional[str] = None
    # Length of stay as an integer (unit presumably days; TODO confirm).
    duree_sejour: Optional[int] = None
    # Entry and exit modes as free-form strings.
    mode_entree: Optional[str] = None
    mode_sortie: Optional[str] = None
    # Body measurements; "imc" is presumably the BMI, and the units of
    # weight ("poids") and height ("taille") are not specified here.
    imc: Optional[float] = None
    poids: Optional[float] = None
    taille: Optional[float] = None
|
||||
|
||||
|
||||
class Diagnostic(BaseModel):
    """A diagnosis expressed as text, with an optional CIM-10 code suggestion."""

    # Diagnosis text (required).
    texte: str
    # Suggested CIM-10 (ICD-10) code; ``None`` when no suggestion is available.
    # The code format is not validated by this model.
    cim10_suggestion: Optional[str] = None
|
||||
|
||||
|
||||
class ActeCCAM(BaseModel):
    """A medical act expressed as text, with an optional CCAM code suggestion."""

    # Act description (required).
    texte: str
    # Suggested CCAM code; ``None`` when no suggestion is available.
    code_ccam_suggestion: Optional[str] = None
    # Date associated with the act, stored as a plain string (format not enforced).
    date: Optional[str] = None
|
||||
|
||||
|
||||
class Traitement(BaseModel):
    """A treatment entry: a medication with optional dosage and ATC code."""

    # Medication name (required).
    medicament: str
    # Dosage ("posologie") as free text.
    posologie: Optional[str] = None
    # ATC code of the medication; ``None`` when unknown.
    code_atc: Optional[str] = None
|
||||
|
||||
|
||||
class BiologieCle(BaseModel):
    """A key laboratory result: test name, optional value and anomaly flag."""

    # Name of the laboratory test (required).
    test: str
    # Result kept as a string, so units or qualifiers can be stored as written.
    valeur: Optional[str] = None
    # Tri-state flag: True/False when known, ``None`` when not determined.
    anomalie: Optional[bool] = None
|
||||
|
||||
|
||||
class Imagerie(BaseModel):
    """An imaging entry: exam type with optional conclusion and score."""

    # Exam type (required). The field name mirrors the builtin ``type`` but is
    # kept as-is because it is part of the model's field names.
    type: str
    # Conclusion of the exam as free text.
    conclusion: Optional[str] = None
    # Score attached to the exam, kept as a string (scale not specified here).
    score: Optional[str] = None
|
||||
|
||||
|
||||
class DossierMedical(BaseModel):
    """Aggregate record gathering everything extracted from one document.

    All fields have defaults, so ``DossierMedical()`` is valid. Nested models
    and lists use ``default_factory`` so each instance gets its own fresh
    ``Sejour`` and its own empty lists rather than sharing one object.
    """

    # Provenance of the record; both default to the empty string.
    source_file: str = ""
    document_type: str = ""

    # Stay-level information; an empty ``Sejour`` is created when none is given.
    sejour: Sejour = Field(default_factory=Sejour)

    # Principal diagnosis (``None`` when absent) and associated diagnoses.
    diagnostic_principal: Optional[Diagnostic] = None
    diagnostics_associes: list[Diagnostic] = Field(default_factory=list)

    # Coded acts, free-text history items and discharge treatments.
    actes_ccam: list[ActeCCAM] = Field(default_factory=list)
    antecedents: list[str] = Field(default_factory=list)
    traitements_sortie: list[Traitement] = Field(default_factory=list)

    # Key laboratory results and imaging entries.
    biologie_cle: list[BiologieCle] = Field(default_factory=list)
    imagerie: list[Imagerie] = Field(default_factory=list)

    # Complications as free-text items.
    complications: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
# --- Rapport d'anonymisation ---
|
||||
|
||||
|
||||
class AnonymizationReport(BaseModel):
    """Counters and details describing the anonymisation of one source file."""

    # File the report refers to (required).
    source_file: str

    # Replacement counters, all starting at zero.
    # NOTE(review): total_replacements is presumably the sum of the three
    # per-stage counters below, but nothing in this model enforces it.
    total_replacements: int = 0
    regex_replacements: int = 0
    ner_replacements: int = 0
    sweep_replacements: int = 0

    # One dict per detected entity; the dict keys are not defined in this
    # module. default_factory gives each report its own list.
    entities_found: list[dict] = Field(default_factory=list)
|
||||
Reference in New Issue
Block a user