Files
t2a/tests/test_anonymization.py
dom 4a12cd2676 feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les
  diagnostics, médicaments (codes ATC via Romedi) et négation,
  avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 15:24:12 +01:00

198 lines
6.4 KiB
Python

"""Tests pour le module d'anonymisation."""
import pytest
from src.anonymization.entity_registry import EntityRegistry
from src.anonymization.regex_patterns import (
CRH_FOOTER_IPP_EPISODE,
CRH_FOOTER_PATIENT_PATTERN,
DATE_NAISSANCE_PATTERN,
DR_NAME_PATTERN,
EMAIL_PATTERN,
EPISODE_PATTERN,
FOOTER_PATIENT_PATTERN,
IPP_PATTERN,
NOTE_AUTHOR_PATTERN,
PHONE_PATTERN,
RPPS_PATTERN,
)
class TestRegexPatterns:
    """Exercise each anonymization regex against one representative sample."""

    def test_ipp_with_colon(self):
        """An IPP written as ``IPP: <digits>`` is captured in group 1."""
        sample = "IPP: 01306172"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"

    def test_ipp_without_colon(self):
        """An IPP written as ``IPP <digits>`` is captured in group 1."""
        sample = "IPP 01306172"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        """A parenthesised number is captured in group 2 of the IPP pattern."""
        sample = "(01306172 )"
        found = IPP_PATTERN.search(sample)
        assert found is not None
        assert found.group(2) == "01306172"

    def test_episode_no(self):
        """The ``Episode No:`` form yields the number in group 1."""
        sample = "Episode No: 23042753"
        found = EPISODE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23042753"

    def test_episode_n_degree(self):
        """The ``N° Episode`` form yields the number in group 2."""
        sample = "N° Episode 23042753"
        found = EPISODE_PATTERN.search(sample)
        assert found is not None
        assert found.group(2) == "23042753"

    def test_phone_dots(self):
        """A dot-separated phone number is matched in full."""
        sample = "06.25.39.26.82"
        found = PHONE_PATTERN.search(sample)
        assert found is not None
        assert found.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        """A space-separated phone number produces a match."""
        sample = "05 59 44 35 35"
        found = PHONE_PATTERN.search(sample)
        assert found is not None

    def test_email(self):
        """An e-mail address is matched in full."""
        sample = "faudemar@ch-cotebasque.fr"
        found = EMAIL_PATTERN.search(sample)
        assert found is not None
        assert found.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        """An RPPS identifier following ``RPPS :`` is captured in group 1."""
        sample = "RPPS : 10100532760"
        found = RPPS_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        """The ``née le`` phrasing yields the date in group 1."""
        sample = "née le 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        """The ``Né(e) le`` phrasing yields the date in group 1."""
        sample = "Né(e) le 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        """The ``Date de naissance:`` field yields the date in group 1."""
        sample = "Date de naissance: 23/02/1980"
        found = DATE_NAISSANCE_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "23/02/1980"

    def test_dr_name(self):
        """A name prefixed by ``Dr`` and an initial has the surname in group 1."""
        sample = "Dr F. AUDEMAR"
        found = DR_NAME_PATTERN.search(sample)
        assert found is not None
        captured = found.group(1)
        assert "AUDEMAR" in captured

    def test_dr_name_docteur(self):
        """A name prefixed by ``Docteur`` has the surname in group 1."""
        sample = "Docteur AUDEMAR Franck"
        found = DR_NAME_PATTERN.search(sample)
        assert found is not None
        captured = found.group(1)
        assert "AUDEMAR" in captured

    def test_note_author(self):
        """The author of a ``Note IDE`` line is captured without trailing text."""
        sample = "Note IDE Annie GUIRESSE Non algique"
        found = NOTE_AUTHOR_PATTERN.search(sample)
        assert found is not None
        assert found.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        """The ``Patient:`` footer line exposes the patient name in group 1."""
        sample = "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980"
        found = FOOTER_PATIENT_PATTERN.search(sample)
        assert found is not None
        captured = found.group(1)
        assert "CLIER" in captured

    def test_crh_footer_patient(self):
        """The ``Patient(e) :`` footer line exposes the patient name in group 1."""
        sample = "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980"
        found = CRH_FOOTER_PATIENT_PATTERN.search(sample)
        assert found is not None
        captured = found.group(1)
        assert "CLIER" in captured

    def test_crh_footer_ipp_episode(self):
        """The combined footer yields the IPP in group 1 and episode in group 2."""
        sample = "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)"
        found = CRH_FOOTER_IPP_EPISODE.search(sample)
        assert found is not None
        assert found.group(1) == "01306172"
        assert found.group(2) == "23042753"
class TestEntityRegistry:
    """Check the pseudonyms handed out by ``EntityRegistry``."""

    def test_register_returns_pseudo(self):
        """The first patient registered receives ``[PATIENT_1]``."""
        registry = EntityRegistry()
        placeholder = registry.register("Jean Dupont", "patient")
        assert placeholder == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        """Registering the identical entity twice yields the same pseudonym."""
        registry = EntityRegistry()
        first = registry.register("Jean Dupont", "patient")
        second = registry.register("Jean Dupont", "patient")
        assert first == second

    def test_register_case_insensitive(self):
        """Two spellings differing only by case share one pseudonym."""
        registry = EntityRegistry()
        upper_variant = registry.register("Jean DUPONT", "patient")
        lower_variant = registry.register("jean dupont", "patient")
        assert upper_variant == lower_variant

    def test_register_different_categories(self):
        """Each category numbers its pseudonyms independently, starting at 1."""
        registry = EntityRegistry()
        patient_tag = registry.register("Dupont", "patient")
        doctor_tag = registry.register("Martin", "medecin")
        assert patient_tag == "[PATIENT_1]"
        assert doctor_tag == "[MEDECIN_1]"

    def test_get_replacement(self):
        """Lookup returns the pseudonym for a known name and None otherwise."""
        registry = EntityRegistry()
        registry.register("Jean Dupont", "patient")
        known = registry.get_replacement("jean dupont")
        assert known == "[PATIENT_1]"
        unknown = registry.get_replacement("inconnu")
        assert unknown is None
class TestAnonymizer:
    """End-to-end checks of ``Anonymizer.anonymize`` on short texts."""

    def test_anonymize_basic(self):
        """Names supplied via ``parsed_data`` must not survive in the output."""
        # Imported inside the test so the import only happens when it runs.
        from src.anonymization.anonymizer import Anonymizer

        known_entities = {
            "patient": {"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            "medecins": ["MARTIN Pierre"],
            "contacts": [],
        }
        engine = Anonymizer(parsed_data=known_entities)
        source_text = "Le patient DUPONT Jean a été vu par Dr MARTIN Pierre."
        output = engine.anonymize(source_text)

        for leaked_name in ("DUPONT", "MARTIN"):
            assert leaked_name not in output
        # At least one of the two placeholder prefixes has to be present.
        assert any(prefix in output for prefix in ("[PATIENT", "[MEDECIN"))

    def test_preserves_medical_content(self):
        """Medical vocabulary is left in place when no entities are known."""
        from src.anonymization.anonymizer import Anonymizer

        no_entities = {"patient": {}, "medecins": [], "contacts": []}
        engine = Anonymizer(parsed_data=no_entities)
        source_text = (
            "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        )
        output = engine.anonymize(source_text)

        for term in ("Pancréatite", "Cholécystectomie", "IMC"):
            assert term in output

    def test_anonymize_phone(self):
        """A phone number is removed and a ``[TEL`` placeholder appears."""
        from src.anonymization.anonymizer import Anonymizer

        no_entities = {"patient": {}, "medecins": [], "contacts": []}
        engine = Anonymizer(parsed_data=no_entities)
        source_text = "Appeler le 06.25.39.26.82 pour le rendez-vous."
        output = engine.anonymize(source_text)

        assert "06.25.39.26.82" not in output
        assert "[TEL" in output

    def test_anonymize_email(self):
        """An e-mail address is removed and an ``[EMAIL`` placeholder appears."""
        from src.anonymization.anonymizer import Anonymizer

        no_entities = {"patient": {}, "medecins": [], "contacts": []}
        engine = Anonymizer(parsed_data=no_entities)
        source_text = "Contact: faudemar@ch-cotebasque.fr"
        output = engine.anonymize(source_text)

        assert "faudemar@ch-cotebasque.fr" not in output
        assert "[EMAIL" in output