feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp

Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les
  diagnostics, médicaments (codes ATC via Romedi) et négation,
  avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-10 15:24:12 +01:00
commit 4a12cd2676
25 changed files with 7592 additions and 0 deletions

124
tests/test_integration.py Normal file
View File

@@ -0,0 +1,124 @@
"""Tests d'intégration end-to-end sur les PDFs réels."""
import json
from pathlib import Path
import pytest
from src.config import INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR
from src.main import process_pdf
# Input documents for the end-to-end run. Anything decorated with
# ``needs_pdfs`` is skipped unless both files are present on disk.
CRH_PDF = INPUT_DIR / "CRH 23042753.pdf"
TRACKARE_PDF = INPUT_DIR / "trackare-01306172-23042753_01306172_23042753.pdf"

needs_pdfs = pytest.mark.skipif(
    not (CRH_PDF.exists() and TRACKARE_PDF.exists()),
    reason="PDFs de test non disponibles dans input/",
)

# Known personal data that the tests look for in the anonymised output.
# NOTE(review): these literals look like real identifiers committed to the
# repository - confirm they are synthetic or move them to an untracked fixture.
PATIENT_PII = [
    "NARBAIS",
    "CLIER",
    "AUDREY",
    "01306172",
    "23042753",
    "23/02/1980",
    "IRREXELAIA",
    "BAIGORRY",
    "06.25.39.26.82",
]

# Caregiver ("soignant") names that the tests look for in the anonymised output.
SOIGNANT_NAMES = [
    "DUTREY",
    "MENDIBOURE",
    "PUJOS",
    "AUDEMAR",
    "MELLIN",
    "GUIRESSE",
    "GOYTINO",
    "SERRE",
    "NOVION",
]
@needs_pdfs
class TestCRHIntegration:
    """End-to-end checks on the output of ``process_pdf`` for the CRH PDF."""

    @pytest.fixture(autouse=True, scope="class")
    def setup(self, request):
        """Run the pipeline once for the whole class and share its outputs.

        The fixture is class-scoped so that ``process_pdf`` is not re-executed
        for every test method; none of the tests below mutates the results.
        The three returned values are stored on the class (via
        ``request.cls``) because each test method runs on its own instance.
        """
        cls = request.cls
        cls.anonymized, cls.dossier, cls.report = process_pdf(CRH_PDF)

    def test_no_patient_pii(self):
        """No known patient identifier appears in the anonymised text."""
        # Case-insensitive substring search over the whole document.
        text_upper = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"

    def test_medical_content_preserved(self):
        """Key medical terms are still present in the anonymised text."""
        text_lower = self.anonymized.lower()
        for term in ["pancréatite", "cholécystectomie", "cholangiographie", "lithiase"]:
            assert term in text_lower, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis is extracted and suggested as K85.1."""
        dp = self.dossier.diagnostic_principal
        assert dp is not None
        assert dp.cim10_suggestion == "K85.1"

    def test_diagnostics_associes(self):
        """At least one of K80.5 / K80.2 is among the associated diagnoses."""
        codes = {d.cim10_suggestion for d in self.dossier.diagnostics_associes}
        assert "K80.5" in codes or "K80.2" in codes

    def test_sejour(self):
        """Stay metadata (sex, age, dates, length) matches the expected values."""
        s = self.dossier.sejour
        assert s.sexe == "F"
        assert s.age == 43
        assert s.date_entree == "25/02/2023"
        assert s.date_sortie == "03/03/2023"
        assert s.duree_sejour == 6

    def test_traitements_have_optional_atc(self):
        """Every discharge treatment exposes a ``code_atc`` attribute.

        Only the presence of the attribute is checked, not its value.
        """
        for t in self.dossier.traitements_sortie:
            assert hasattr(t, "code_atc")
@needs_pdfs
class TestTrackareIntegration:
    """End-to-end checks on the output of ``process_pdf`` for the Trackare PDF."""

    @pytest.fixture(autouse=True, scope="class")
    def setup(self, request):
        """Run the pipeline once for the whole class and share its outputs.

        The fixture is class-scoped so that ``process_pdf`` is not re-executed
        for every test method; none of the tests below mutates the results.
        The three returned values are stored on the class (via
        ``request.cls``) because each test method runs on its own instance.
        """
        cls = request.cls
        cls.anonymized, cls.dossier, cls.report = process_pdf(TRACKARE_PDF)

    def test_no_patient_pii(self):
        """No known patient identifier appears in the anonymised text."""
        # Case-insensitive substring search over the whole document.
        text_upper = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"

    def test_no_soignant_names(self):
        """No caregiver name appears as a standalone word in the text."""
        # Imported once for the method instead of on every loop iteration.
        import re

        text = self.anonymized
        for name in SOIGNANT_NAMES:
            # Match the name as a whole word (case-insensitive) so that a
            # longer word merely containing it is not reported.
            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            assert len(matches) == 0, f"Nom soignant trouvé : {name} ({len(matches)} occurrences)"

    def test_medical_content_preserved(self):
        """Key medical terms are still present in the anonymised text."""
        text_lower = self.anonymized.lower()
        for term in ["pancréatite", "cholécystectomie", "morphine", "paracétamol"]:
            assert term in text_lower, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis is extracted and is either K80.5 or K85.1."""
        dp = self.dossier.diagnostic_principal
        assert dp is not None
        assert dp.cim10_suggestion in ("K80.5", "K85.1")

    def test_sejour_with_vitals(self):
        """Stay metadata includes sex, age, and non-empty BMI/weight/height."""
        s = self.dossier.sejour
        assert s.sexe == "F"
        assert s.age == 43
        assert s.imc is not None
        assert s.imc > 30
        assert s.poids is not None
        assert s.taille is not None

    def test_biologie(self):
        """Lipasémie and CRP are extracted, and Lipasémie is flagged abnormal."""
        tests = {b.test for b in self.dossier.biologie_cle}
        assert "Lipasémie" in tests
        assert "CRP" in tests
        # Every Lipasémie entry must carry the anomaly flag.
        for b in self.dossier.biologie_cle:
            if b.test == "Lipasémie":
                assert b.anomalie is True

    def test_report_counts(self):
        """The report records more than 100 replacements, over 50 from regex."""
        assert self.report.total_replacements > 100
        assert self.report.regex_replacements > 50