Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
"""Tests d'intégration end-to-end sur les PDFs réels."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from src.config import INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR
|
|
from src.main import process_pdf
|
|
|
|
|
|
# Sample PDFs exercised by the integration tests below.
CRH_PDF = INPUT_DIR / "CRH 23042753.pdf"
TRACKARE_PDF = INPUT_DIR / "trackare-01306172-23042753_01306172_23042753.pdf"

# Marker that skips the decorated tests unless both sample PDFs exist.
needs_pdfs = pytest.mark.skipif(
    not (CRH_PDF.exists() and TRACKARE_PDF.exists()),
    reason="PDFs de test non disponibles dans input/",
)
|
|
|
|
|
|
# Known patient-identifying strings that must be absent from anonymized text.
PATIENT_PII = [
    "NARBAIS",
    "CLIER",
    "AUDREY",
    "01306172",
    "23042753",
    "23/02/1980",
    "IRREXELAIA",
    "BAIGORRY",
    "06.25.39.26.82",
]
|
|
|
|
# Caregiver surnames that must not appear in the anonymized text.
SOIGNANT_NAMES = [
    "DUTREY",
    "MENDIBOURE",
    "PUJOS",
    "AUDEMAR",
    "MELLIN",
    "GUIRESSE",
    "GOYTINO",
    "SERRE",
    "NOVION",
]
|
|
|
|
|
|
@needs_pdfs
class TestCRHIntegration:
    """End-to-end checks on what ``process_pdf`` produces for the CRH PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run the pipeline on the CRH PDF and keep its three results on self."""
        outcome = process_pdf(CRH_PDF)
        self.anonymized, self.dossier, self.report = outcome

    def test_no_patient_pii(self):
        """None of the known patient identifiers remain (case-insensitive)."""
        haystack = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in haystack, f"PII trouvé : {pii}"

    def test_medical_content_preserved(self):
        """Key medical terms are still present after anonymization."""
        expected_terms = (
            "pancréatite",
            "cholécystectomie",
            "cholangiographie",
            "lithiase",
        )
        haystack = self.anonymized.lower()
        for term in expected_terms:
            assert term in haystack, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis exists and suggests ICD-10 code K85.1."""
        principal = self.dossier.diagnostic_principal
        assert principal is not None
        assert principal.cim10_suggestion == "K85.1"

    def test_diagnostics_associes(self):
        """At least one associated diagnosis suggests K80.5 or K80.2."""
        suggested = {
            diag.cim10_suggestion for diag in self.dossier.diagnostics_associes
        }
        assert suggested & {"K80.5", "K80.2"}

    def test_sejour(self):
        """Stay metadata matches the values expected for this document."""
        sejour = self.dossier.sejour
        assert sejour.sexe == "F"
        assert sejour.age == 43
        assert sejour.date_entree == "25/02/2023"
        assert sejour.date_sortie == "03/03/2023"
        assert sejour.duree_sejour == 6

    def test_traitements_have_optional_atc(self):
        """Every discharge treatment exposes a ``code_atc`` field (may be None)."""
        assert all(
            hasattr(traitement, "code_atc")
            for traitement in self.dossier.traitements_sortie
        )
|
|
|
|
|
|
@needs_pdfs
class TestTrackareIntegration:
    """End-to-end checks on what ``process_pdf`` produces for the Trackare PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run the pipeline on the Trackare PDF and keep its three results."""
        self.anonymized, self.dossier, self.report = process_pdf(TRACKARE_PDF)

    def test_no_patient_pii(self):
        """None of the known patient identifiers remain (case-insensitive)."""
        text_upper = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"

    def test_no_soignant_names(self):
        """No caregiver name appears as a standalone word in the output."""
        # Imported once per test instead of on every loop iteration.
        import re

        text = self.anonymized
        for name in SOIGNANT_NAMES:
            # Match the name as a whole word only, so a name embedded in a
            # longer ordinary word is not reported as a leak.
            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            assert not matches, f"Nom soignant trouvé : {name} ({len(matches)} occurrences)"

    def test_medical_content_preserved(self):
        """Key medical terms are still present after anonymization."""
        text_lower = self.anonymized.lower()
        for term in ["pancréatite", "cholécystectomie", "morphine", "paracétamol"]:
            assert term in text_lower, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """A principal diagnosis exists and suggests K80.5 or K85.1."""
        dp = self.dossier.diagnostic_principal
        assert dp is not None
        assert dp.cim10_suggestion in ("K80.5", "K85.1")

    def test_sejour_with_vitals(self):
        """Stay metadata is correct and weight, height and BMI were extracted."""
        s = self.dossier.sejour
        assert s.sexe == "F"
        assert s.age == 43
        assert s.imc is not None
        assert s.imc > 30
        assert s.poids is not None
        assert s.taille is not None

    def test_biologie(self):
        """Lipase and CRP results are extracted; lipase is flagged abnormal."""
        tests = {b.test for b in self.dossier.biologie_cle}
        assert "Lipasémie" in tests
        assert "CRP" in tests
        # Every lipase entry must be marked as an anomaly.
        for b in self.dossier.biologie_cle:
            if b.test == "Lipasémie":
                assert b.anomalie is True

    def test_report_counts(self):
        """The anonymization report records a substantial number of replacements."""
        assert self.report.total_replacements > 100
        assert self.report.regex_replacements > 50
|