feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
124
tests/test_integration.py
Normal file
124
tests/test_integration.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Tests d'intégration end-to-end sur les PDFs réels."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from src.config import INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR
|
||||
from src.main import process_pdf
|
||||
|
||||
|
||||
# Sample documents the integration tests run against, resolved under INPUT_DIR.
CRH_PDF = INPUT_DIR / "CRH 23042753.pdf"
TRACKARE_PDF = INPUT_DIR / "trackare-01306172-23042753_01306172_23042753.pdf"

# Skip marker for the test classes: skips unless both sample PDFs exist.
needs_pdfs = pytest.mark.skipif(
    not (CRH_PDF.exists() and TRACKARE_PDF.exists()),
    reason="PDFs de test non disponibles dans input/",
)
|
||||
|
||||
|
||||
# Known personal data to check for in the anonymized output.
PATIENT_PII = [
    "NARBAIS",
    "CLIER",
    "AUDREY",
    "01306172",
    "23042753",
    "23/02/1980",
    "IRREXELAIA",
    "BAIGORRY",
    "06.25.39.26.82",
]

# Caregiver names searched for as whole words by the Trackare tests.
SOIGNANT_NAMES = [
    "DUTREY",
    "MENDIBOURE",
    "PUJOS",
    "AUDEMAR",
    "MELLIN",
    "GUIRESSE",
    "GOYTINO",
    "SERRE",
    "NOVION",
]
|
||||
|
||||
|
||||
@needs_pdfs
class TestCRHIntegration:
    """End-to-end checks of ``process_pdf`` on the CRH sample PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run the pipeline on the CRH PDF before each test and store its results."""
        result = process_pdf(CRH_PDF)
        self.anonymized, self.dossier, self.report = result

    def test_no_patient_pii(self):
        """No known patient identifier appears in the anonymized text (case-insensitive)."""
        haystack = self.anonymized.upper()
        for pii in PATIENT_PII:
            needle = pii.upper()
            assert needle not in haystack, f"PII trouvé : {pii}"

    def test_medical_content_preserved(self):
        """Key medical terms are still present after anonymization (case-insensitive)."""
        haystack = self.anonymized.lower()
        expected_terms = (
            "pancréatite",
            "cholécystectomie",
            "cholangiographie",
            "lithiase",
        )
        for term in expected_terms:
            assert term in haystack, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis is extracted and suggested as K85.1."""
        principal = self.dossier.diagnostic_principal
        assert principal is not None
        assert principal.cim10_suggestion == "K85.1"

    def test_diagnostics_associes(self):
        """At least one of K80.5 / K80.2 is suggested among the associated diagnoses."""
        suggested = set()
        for diagnostic in self.dossier.diagnostics_associes:
            suggested.add(diagnostic.cim10_suggestion)
        assert not suggested.isdisjoint({"K80.5", "K80.2"})

    def test_sejour(self):
        """The extracted stay data matches the expected values for this document."""
        sejour = self.dossier.sejour
        assert sejour.sexe == "F"
        assert sejour.age == 43
        assert sejour.date_entree == "25/02/2023"
        assert sejour.date_sortie == "03/03/2023"
        assert sejour.duree_sejour == 6

    def test_traitements_have_optional_atc(self):
        """Every entry of ``traitements_sortie`` has a ``code_atc`` attribute (may be None)."""
        for traitement in self.dossier.traitements_sortie:
            assert hasattr(traitement, "code_atc")
|
||||
|
||||
|
||||
@needs_pdfs
class TestTrackareIntegration:
    """End-to-end checks of ``process_pdf`` on the Trackare sample PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run the pipeline on the Trackare PDF before each test and store its results."""
        self.anonymized, self.dossier, self.report = process_pdf(TRACKARE_PDF)

    def test_no_patient_pii(self):
        """No known patient identifier appears in the anonymized text (case-insensitive)."""
        text_upper = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"

    def test_no_soignant_names(self):
        """No caregiver name from ``SOIGNANT_NAMES`` appears as a whole word."""
        # Imported once at function scope instead of on every loop iteration.
        import re

        text = self.anonymized
        for name in SOIGNANT_NAMES:
            # Whole-word, case-insensitive match: a name that is merely a
            # substring of a longer word is not reported.
            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            assert not matches, f"Nom soignant trouvé : {name} ({len(matches)} occurrences)"

    def test_medical_content_preserved(self):
        """Key medical terms are still present after anonymization (case-insensitive)."""
        text_lower = self.anonymized.lower()
        for term in ["pancréatite", "cholécystectomie", "morphine", "paracétamol"]:
            assert term in text_lower, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis is extracted and suggested as K80.5 or K85.1."""
        dp = self.dossier.diagnostic_principal
        assert dp is not None
        assert dp.cim10_suggestion in ("K80.5", "K85.1")

    def test_sejour_with_vitals(self):
        """Stay data includes sex, age and non-empty BMI / weight / height values."""
        s = self.dossier.sejour
        assert s.sexe == "F"
        assert s.age == 43
        assert s.imc is not None
        assert s.imc > 30
        assert s.poids is not None
        assert s.taille is not None

    def test_biologie(self):
        """Lipasémie and CRP are extracted, and every Lipasémie entry is flagged abnormal."""
        tests = {b.test for b in self.dossier.biologie_cle}
        assert "Lipasémie" in tests
        assert "CRP" in tests
        # Lipasémie must be flagged as abnormal
        for b in self.dossier.biologie_cle:
            if b.test == "Lipasémie":
                assert b.anomalie is True

    def test_report_counts(self):
        """The anonymization report counts exceed the expected minimums."""
        assert self.report.total_replacements > 100
        assert self.report.regex_replacements > 50
|
||||
Reference in New Issue
Block a user