feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp

Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 15:24:12 +01:00
commit 4a12cd2676
25 changed files with 7592 additions and 0 deletions
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -0,0 +1,124 @@
+"""Tests d'intégration end-to-end sur les PDFs réels."""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from src.config import INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR
+from src.main import process_pdf
+
+
+# Sample documents; the integration classes are skipped when either is absent.
+CRH_PDF = INPUT_DIR / "CRH 23042753.pdf"
+TRACKARE_PDF = INPUT_DIR / "trackare-01306172-23042753_01306172_23042753.pdf"
+
+needs_pdfs = pytest.mark.skipif(
+    not all(pdf.exists() for pdf in (CRH_PDF, TRACKARE_PDF)),
+    reason="PDFs de test non disponibles dans input/",
+)
+
+
+# Known personal data that must be absent from the anonymised text.
+# NOTE(review): these look like real identifiers stored in clear text in the
+# repository -- confirm they are synthetic, or load them from an untracked file.
+PATIENT_PII = [
+    "NARBAIS", "CLIER", "AUDREY", "01306172", "23042753",
+    "23/02/1980", "IRREXELAIA", "BAIGORRY", "06.25.39.26.82",
+]
+
+# Caregiver names that must not appear as whole words in the anonymised text.
+SOIGNANT_NAMES = [
+    "DUTREY", "MENDIBOURE", "PUJOS", "AUDEMAR", "MELLIN",
+    "GUIRESSE", "GOYTINO", "SERRE", "NOVION",
+]
+
+
+@needs_pdfs
+class TestCRHIntegration:
+    """End-to-end checks of ``process_pdf`` on the CRH sample PDF."""
+
+    @pytest.fixture(autouse=True, scope="class")
+    def setup(self, request):
+        """Run the pipeline once for the whole class and share its outputs.
+
+        pytest creates a fresh instance of the class for every test, so the
+        results are stored on the class itself (``request.cls``); each test
+        still reads them through ``self.<attr>``. Running the pipeline once
+        instead of once per test avoids repeating the same work for every
+        assertion group. Tests must treat these objects as read-only.
+        """
+        cls = request.cls
+        cls.anonymized, cls.dossier, cls.report = process_pdf(CRH_PDF)
+
+    def test_no_patient_pii(self):
+        """No known patient identifier survives, compared case-insensitively."""
+        text_upper = self.anonymized.upper()
+        for pii in PATIENT_PII:
+            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"
+
+    def test_medical_content_preserved(self):
+        """Key medical terms are still present after anonymisation."""
+        text_lower = self.anonymized.lower()
+        for term in ["pancréatite", "cholécystectomie", "cholangiographie", "lithiase"]:
+            assert term in text_lower, f"Terme médical manquant : {term}"
+
+    def test_diagnostic_principal(self):
+        """A principal diagnosis is extracted and suggested as K85.1."""
+        dp = self.dossier.diagnostic_principal
+        assert dp is not None
+        assert dp.cim10_suggestion == "K85.1"
+
+    def test_diagnostics_associes(self):
+        """At least one of K80.5 / K80.2 is among the associated diagnoses."""
+        codes = {d.cim10_suggestion for d in self.dossier.diagnostics_associes}
+        assert "K80.5" in codes or "K80.2" in codes
+
+    def test_sejour(self):
+        """Stay fields match the values expected for this document."""
+        s = self.dossier.sejour
+        assert s.sexe == "F"
+        assert s.age == 43
+        assert s.date_entree == "25/02/2023"
+        assert s.date_sortie == "03/03/2023"
+        assert s.duree_sejour == 6
+
+    def test_traitements_have_optional_atc(self):
+        """Every discharge treatment exposes a ``code_atc`` field (may be None)."""
+        for t in self.dossier.traitements_sortie:
+            assert hasattr(t, "code_atc")
+
+
+@needs_pdfs
+class TestTrackareIntegration:
+    """End-to-end checks of ``process_pdf`` on the Trackare sample PDF."""
+
+    @pytest.fixture(autouse=True, scope="class")
+    def setup(self, request):
+        """Run the pipeline once for the whole class and share its outputs.
+
+        pytest creates a fresh instance of the class for every test, so the
+        results are stored on the class itself (``request.cls``); each test
+        still reads them through ``self.<attr>``. Running the pipeline once
+        instead of once per test avoids repeating the same work for every
+        assertion group. Tests must treat these objects as read-only.
+        """
+        cls = request.cls
+        cls.anonymized, cls.dossier, cls.report = process_pdf(TRACKARE_PDF)
+
+    def test_no_patient_pii(self):
+        """No known patient identifier survives, compared case-insensitively."""
+        text_upper = self.anonymized.upper()
+        for pii in PATIENT_PII:
+            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"
+
+    def test_no_soignant_names(self):
+        """No caregiver name remains as a standalone word (case-insensitive)."""
+        import re  # local import: the module's import block is left untouched
+
+        text = self.anonymized
+        for name in SOIGNANT_NAMES:
+            # Whole-word match only, so a name embedded in a longer ordinary
+            # word is not reported as a leak.
+            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
+            matches = pattern.findall(text)
+            assert len(matches) == 0, f"Nom soignant trouvé : {name} ({len(matches)} occurrences)"
+
+    def test_medical_content_preserved(self):
+        """Key medical terms are still present after anonymisation."""
+        text_lower = self.anonymized.lower()
+        for term in ["pancréatite", "cholécystectomie", "morphine", "paracétamol"]:
+            assert term in text_lower, f"Terme médical manquant : {term}"
+
+    def test_diagnostic_principal(self):
+        """A principal diagnosis is extracted and suggested as K80.5 or K85.1."""
+        dp = self.dossier.diagnostic_principal
+        assert dp is not None
+        assert dp.cim10_suggestion in ("K80.5", "K85.1")
+
+    def test_sejour_with_vitals(self):
+        """Stay fields include sex, age and the weight/height/BMI measurements."""
+        s = self.dossier.sejour
+        assert s.sexe == "F"
+        assert s.age == 43
+        assert s.imc is not None
+        assert s.imc > 30
+        assert s.poids is not None
+        assert s.taille is not None
+
+    def test_biologie(self):
+        """Lipasémie and CRP are extracted; Lipasémie is flagged abnormal."""
+        tests = {b.test for b in self.dossier.biologie_cle}
+        assert "Lipasémie" in tests
+        assert "CRP" in tests
+        # Every Lipasémie entry must be marked as an anomaly.
+        for b in self.dossier.biologie_cle:
+            if b.test == "Lipasémie":
+                assert b.anomalie is True
+
+    def test_report_counts(self):
+        """The report shows more than 100 replacements, over 50 from regex."""
+        assert self.report.total_replacements > 100
+        assert self.report.regex_replacements > 50