feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
197
tests/test_anonymization.py
Normal file
197
tests/test_anonymization.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""Tests pour le module d'anonymisation."""
|
||||
|
||||
import pytest
|
||||
|
||||
from src.anonymization.entity_registry import EntityRegistry
|
||||
from src.anonymization.regex_patterns import (
|
||||
CRH_FOOTER_IPP_EPISODE,
|
||||
CRH_FOOTER_PATIENT_PATTERN,
|
||||
DATE_NAISSANCE_PATTERN,
|
||||
DR_NAME_PATTERN,
|
||||
EMAIL_PATTERN,
|
||||
EPISODE_PATTERN,
|
||||
FOOTER_PATIENT_PATTERN,
|
||||
IPP_PATTERN,
|
||||
NOTE_AUTHOR_PATTERN,
|
||||
PHONE_PATTERN,
|
||||
RPPS_PATTERN,
|
||||
)
|
||||
|
||||
|
||||
class TestRegexPatterns:
    """Exercise each anonymization regex against one representative sample."""

    def test_ipp_with_colon(self):
        """``IPP:`` followed by a number exposes the number as group 1."""
        sample = "IPP: 01306172"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"

    def test_ipp_without_colon(self):
        """``IPP`` followed by a number (no colon) exposes it as group 1."""
        sample = "IPP 01306172"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        """A parenthesised number is matched and exposed as group 2."""
        sample = "(01306172 )"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(2) == "01306172"

    def test_episode_no(self):
        """``Episode No:`` form exposes the episode number as group 1."""
        sample = "Episode No: 23042753"
        found = EPISODE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23042753"

    def test_episode_n_degree(self):
        """``N° Episode`` form exposes the episode number as group 2."""
        sample = "N° Episode 23042753"
        found = EPISODE_PATTERN.search(sample)
        assert found is not None
        assert found.group(2) == "23042753"

    def test_phone_dots(self):
        """A dot-separated phone number is matched in full."""
        sample = "06.25.39.26.82"
        found = PHONE_PATTERN.search(sample)
        assert found is not None
        assert found.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        """A space-separated phone number produces a match."""
        sample = "05 59 44 35 35"
        found = PHONE_PATTERN.search(sample)
        assert found is not None

    def test_email(self):
        """An e-mail address is matched in full."""
        sample = "faudemar@ch-cotebasque.fr"
        found = EMAIL_PATTERN.search(sample)
        assert found is not None
        assert found.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        """``RPPS :`` followed by a number exposes the number as group 1."""
        sample = "RPPS : 10100532760"
        found = RPPS_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        """``née le <date>`` exposes the date as group 1."""
        sample = "née le 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        """``Né(e) le <date>`` exposes the date as group 1."""
        sample = "Né(e) le 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        """``Date de naissance: <date>`` exposes the date as group 1."""
        sample = "Date de naissance: 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_dr_name(self):
        """``Dr`` + initial + surname: group 1 contains the surname."""
        sample = "Dr F. AUDEMAR"
        found = DR_NAME_PATTERN.search(sample)
        assert found is not None
        assert "AUDEMAR" in found.group(1)

    def test_dr_name_docteur(self):
        """``Docteur`` + surname + first name: group 1 contains the surname."""
        sample = "Docteur AUDEMAR Franck"
        found = DR_NAME_PATTERN.search(sample)
        assert found is not None
        assert "AUDEMAR" in found.group(1)

    def test_note_author(self):
        """The author following ``Note IDE`` is exposed exactly as group 1."""
        sample = "Note IDE Annie GUIRESSE Non algique"
        found = NOTE_AUTHOR_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        """``Patient: <name> - Date de naissance: ...``: group 1 holds the name."""
        sample = "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980"
        found = FOOTER_PATIENT_PATTERN.search(sample)
        assert found is not None
        assert "CLIER" in found.group(1)

    def test_crh_footer_patient(self):
        """``Patient(e) : <name> Né(e) le ...``: group 1 holds the name."""
        sample = "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980"
        found = CRH_FOOTER_PATIENT_PATTERN.search(sample)
        assert found is not None
        assert "CLIER" in found.group(1)

    def test_crh_footer_ipp_episode(self):
        """Combined footer line: IPP is group 1, episode number is group 2."""
        sample = "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)"
        found = CRH_FOOTER_IPP_EPISODE.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"
        assert found.group(2) == "23042753"
|
||||
|
||||
|
||||
class TestEntityRegistry:
    """Check how EntityRegistry assigns and looks up placeholders."""

    def test_register_returns_pseudo(self):
        """The first entity registered as "patient" gets ``[PATIENT_1]``."""
        registry = EntityRegistry()
        placeholder = registry.register("Jean Dupont", "patient")
        assert placeholder == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        """Registering the same text twice yields the same placeholder."""
        registry = EntityRegistry()
        first = registry.register("Jean Dupont", "patient")
        second = registry.register("Jean Dupont", "patient")
        assert first == second

    def test_register_case_insensitive(self):
        """Texts differing only by letter case share one placeholder."""
        registry = EntityRegistry()
        upper_variant = registry.register("Jean DUPONT", "patient")
        lower_variant = registry.register("jean dupont", "patient")
        assert upper_variant == lower_variant

    def test_register_different_categories(self):
        """Each category is numbered on its own, starting at 1."""
        registry = EntityRegistry()
        patient_tag = registry.register("Dupont", "patient")
        doctor_tag = registry.register("Martin", "medecin")
        assert patient_tag == "[PATIENT_1]"
        assert doctor_tag == "[MEDECIN_1]"

    def test_get_replacement(self):
        """Lookup finds a registered entity and returns None for an unknown one."""
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        known = registry.get_replacement("jean dupont")
        assert known == "[PATIENT_1]"
        unknown = registry.get_replacement("inconnu")
        assert unknown is None
|
||||
|
||||
|
||||
class TestAnonymizer:
    """End-to-end checks of Anonymizer.anonymize on short text samples."""

    # NOTE(review): every test imports Anonymizer locally rather than at module
    # level — presumably to defer a costly import; confirm before hoisting.

    def test_anonymize_basic(self):
        """Known patient and doctor names are removed and a placeholder appears."""
        from src.anonymization.anonymizer import Anonymizer

        parsed = {
            "patient": {"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            "medecins": ["MARTIN Pierre"],
            "contacts": [],
        }
        source_text = "Le patient DUPONT Jean a été vu par Dr MARTIN Pierre."
        result = Anonymizer(parsed_data=parsed).anonymize(source_text)

        for surname in ("DUPONT", "MARTIN"):
            assert surname not in result
        # At least one of the two placeholder families must be present.
        assert any(tag in result for tag in ("[PATIENT", "[MEDECIN"))

    def test_preserves_medical_content(self):
        """Medical vocabulary survives when no identities are known."""
        from src.anonymization.anonymizer import Anonymizer

        empty_parsed = {"patient": {}, "medecins": [], "contacts": []}
        source_text = "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        result = Anonymizer(parsed_data=empty_parsed).anonymize(source_text)

        for term in ("Pancréatite", "Cholécystectomie", "IMC"):
            assert term in result

    def test_anonymize_phone(self):
        """A phone number is removed and a ``[TEL`` placeholder appears."""
        from src.anonymization.anonymizer import Anonymizer

        empty_parsed = {"patient": {}, "medecins": [], "contacts": []}
        source_text = "Appeler le 06.25.39.26.82 pour le rendez-vous."
        result = Anonymizer(parsed_data=empty_parsed).anonymize(source_text)

        assert "06.25.39.26.82" not in result
        assert "[TEL" in result

    def test_anonymize_email(self):
        """An e-mail address is removed and an ``[EMAIL`` placeholder appears."""
        from src.anonymization.anonymizer import Anonymizer

        empty_parsed = {"patient": {}, "medecins": [], "contacts": []}
        source_text = "Contact: faudemar@ch-cotebasque.fr"
        result = Anonymizer(parsed_data=empty_parsed).anonymize(source_text)

        assert "faudemar@ch-cotebasque.fr" not in result
        assert "[EMAIL" in result
|
||||
Reference in New Issue
Block a user