Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
198 lines
6.4 KiB
Python
198 lines
6.4 KiB
Python
"""Tests pour le module d'anonymisation."""
|
|
|
|
import pytest
|
|
|
|
from src.anonymization.entity_registry import EntityRegistry
|
|
from src.anonymization.regex_patterns import (
|
|
CRH_FOOTER_IPP_EPISODE,
|
|
CRH_FOOTER_PATIENT_PATTERN,
|
|
DATE_NAISSANCE_PATTERN,
|
|
DR_NAME_PATTERN,
|
|
EMAIL_PATTERN,
|
|
EPISODE_PATTERN,
|
|
FOOTER_PATIENT_PATTERN,
|
|
IPP_PATTERN,
|
|
NOTE_AUTHOR_PATTERN,
|
|
PHONE_PATTERN,
|
|
RPPS_PATTERN,
|
|
)
|
|
|
|
|
|
class TestRegexPatterns:
    """Run each anonymization regex against one representative sample string.

    Every test searches a literal sample with a compiled pattern imported
    from ``src.anonymization.regex_patterns`` and checks both that a match
    exists and which capture group carries the expected value.
    """

    def test_ipp_with_colon(self):
        """``IPP:`` followed by digits yields the number in group 1."""
        sample = "IPP: 01306172"
        hit = IPP_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "01306172"

    def test_ipp_without_colon(self):
        """``IPP`` followed by digits, without a colon, yields group 1."""
        sample = "IPP 01306172"
        hit = IPP_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        """A parenthesised number (trailing space included) yields group 2."""
        sample = "(01306172 )"
        hit = IPP_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(2) == "01306172"

    def test_episode_no(self):
        """The ``Episode No:`` form exposes the number in group 1."""
        sample = "Episode No: 23042753"
        hit = EPISODE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "23042753"

    def test_episode_n_degree(self):
        """The ``N° Episode`` form exposes the number in group 2."""
        sample = "N° Episode 23042753"
        hit = EPISODE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(2) == "23042753"

    def test_phone_dots(self):
        """A dot-separated phone number is matched in full."""
        sample = "06.25.39.26.82"
        hit = PHONE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        """A space-separated phone number produces a match."""
        sample = "05 59 44 35 35"
        hit = PHONE_PATTERN.search(sample)
        assert hit is not None

    def test_email(self):
        """An e-mail address is matched in full."""
        sample = "faudemar@ch-cotebasque.fr"
        hit = EMAIL_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        """``RPPS :`` followed by digits yields the number in group 1."""
        sample = "RPPS : 10100532760"
        hit = RPPS_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        """``née le <date>`` yields the date in group 1."""
        sample = "née le 23/02/1980"
        hit = DATE_NAISSANCE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        """``Né(e) le <date>`` yields the date in group 1."""
        sample = "Né(e) le 23/02/1980"
        hit = DATE_NAISSANCE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        """``Date de naissance: <date>`` yields the date in group 1."""
        sample = "Date de naissance: 23/02/1980"
        hit = DATE_NAISSANCE_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "23/02/1980"

    def test_dr_name(self):
        """``Dr`` + initial + surname: group 1 contains the surname."""
        sample = "Dr F. AUDEMAR"
        hit = DR_NAME_PATTERN.search(sample)
        assert hit is not None
        assert "AUDEMAR" in hit.group(1)

    def test_dr_name_docteur(self):
        """``Docteur`` + surname + first name: group 1 contains the surname."""
        sample = "Docteur AUDEMAR Franck"
        hit = DR_NAME_PATTERN.search(sample)
        assert hit is not None
        assert "AUDEMAR" in hit.group(1)

    def test_note_author(self):
        """Group 1 is exactly the author name, without the text that follows."""
        sample = "Note IDE Annie GUIRESSE Non algique"
        hit = NOTE_AUTHOR_PATTERN.search(sample)
        assert hit is not None
        assert hit.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        """``Patient: ... - Date de naissance:`` footer: group 1 holds the name."""
        footer = "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980"
        hit = FOOTER_PATIENT_PATTERN.search(footer)
        assert hit is not None
        assert "CLIER" in hit.group(1)

    def test_crh_footer_patient(self):
        """``Patient(e) : ... Né(e) le`` footer: group 1 holds the name."""
        footer = "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980"
        hit = CRH_FOOTER_PATIENT_PATTERN.search(footer)
        assert hit is not None
        assert "CLIER" in hit.group(1)

    def test_crh_footer_ipp_episode(self):
        """Combined footer: group 1 is the IPP, group 2 the episode number."""
        footer = "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)"
        hit = CRH_FOOTER_IPP_EPISODE.search(footer)
        assert hit is not None
        assert hit.group(1) == "01306172"
        assert hit.group(2) == "23042753"
|
|
|
|
|
|
class TestEntityRegistry:
    """Exercise ``EntityRegistry.register`` and ``get_replacement``.

    Each test starts from a freshly constructed registry so that the
    numbering of the returned placeholders is predictable.
    """

    def test_register_returns_pseudo(self):
        """The first patient registered receives ``[PATIENT_1]``."""
        registry = EntityRegistry()
        placeholder = registry.register("Jean Dupont", "patient")
        assert placeholder == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        """Registering the identical value twice returns the same placeholder."""
        registry = EntityRegistry()
        first = registry.register("Jean Dupont", "patient")
        second = registry.register("Jean Dupont", "patient")
        assert first == second

    def test_register_case_insensitive(self):
        """Values that differ only by letter case share one placeholder."""
        registry = EntityRegistry()
        upper_form = registry.register("Jean DUPONT", "patient")
        lower_form = registry.register("jean dupont", "patient")
        assert upper_form == lower_form

    def test_register_different_categories(self):
        """Each category numbers its placeholders starting from 1."""
        registry = EntityRegistry()
        patient_tag = registry.register("Dupont", "patient")
        doctor_tag = registry.register("Martin", "medecin")
        assert patient_tag == "[PATIENT_1]"
        assert doctor_tag == "[MEDECIN_1]"

    def test_get_replacement(self):
        """Lookup ignores case for known values and returns None otherwise."""
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        known = registry.get_replacement("jean dupont")
        assert known == "[PATIENT_1]"
        unknown = registry.get_replacement("inconnu")
        assert unknown is None
|
|
|
|
|
|
class TestAnonymizer:
    """End-to-end checks of ``Anonymizer.anonymize`` on short sample texts."""

    @staticmethod
    def _blank_anonymizer():
        """Return an Anonymizer built from parsed data with no known entities."""
        # NOTE(review): the import is kept at function scope exactly as the
        # tests had it; presumably this avoids loading the anonymizer module
        # at collection time - confirm before hoisting it to the file top.
        from src.anonymization.anonymizer import Anonymizer

        return Anonymizer(parsed_data={"patient": {}, "medecins": [], "contacts": []})

    def test_anonymize_basic(self):
        """Names supplied through the parsed data disappear from the output."""
        from src.anonymization.anonymizer import Anonymizer

        known_entities = {
            "patient": {"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            "medecins": ["MARTIN Pierre"],
            "contacts": [],
        }
        engine = Anonymizer(parsed_data=known_entities)
        output = engine.anonymize("Le patient DUPONT Jean a été vu par Dr MARTIN Pierre.")

        assert "DUPONT" not in output
        assert "MARTIN" not in output
        # At least one of the two placeholder families must appear.
        assert "[PATIENT" in output or "[MEDECIN" in output

    def test_preserves_medical_content(self):
        """Clinical vocabulary is left untouched when no entity is known."""
        engine = self._blank_anonymizer()
        output = engine.anonymize(
            "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        )

        assert "Pancréatite" in output
        assert "Cholécystectomie" in output
        assert "IMC" in output

    def test_anonymize_phone(self):
        """A phone number is removed and a ``[TEL`` placeholder appears."""
        engine = self._blank_anonymizer()
        output = engine.anonymize("Appeler le 06.25.39.26.82 pour le rendez-vous.")

        assert "06.25.39.26.82" not in output
        assert "[TEL" in output

    def test_anonymize_email(self):
        """An e-mail address is removed and an ``[EMAIL`` placeholder appears."""
        engine = self._blank_anonymizer()
        output = engine.anonymize("Contact: faudemar@ch-cotebasque.fr")

        assert "faudemar@ch-cotebasque.fr" not in output
        assert "[EMAIL" in output
|