Externalize dictionaries and add anonymization review corpus
This commit is contained in:
63
tests/unit/test_header_pii_detection.py
Normal file
63
tests/unit/test_header_pii_detection.py
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests de non-régression pour les fuites en en-tête de document.
|
||||
"""
|
||||
from anonymizer_core_refactored_onnx import (
|
||||
RE_NUM_ACCESSION_HEADER,
|
||||
RE_NUM_EXAMEN_PATIENT,
|
||||
anonymise_document_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
|
||||
|
||||
class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""

    @staticmethod
    def _audit_contains(audit, kind, original):
        """Return True when *audit* has an entry with this ``kind`` and ``original``."""
        return any(hit.kind == kind and hit.original == original for hit in audit)

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-word name line must come out as three ``[NOM]`` tokens."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries
        )

        masked = result.text_out
        # None of the original name parts may survive in the output text.
        for name_part in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert name_part not in masked
        assert masked == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """``RE_NUM_EXAMEN_PATIENT`` captures the compact number after the label."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <number>`` header line is matched and recorded as DOSSIER."""
        dictionaries = load_dictionaries(None)
        header = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        # The dedicated header pattern must capture the number on its own.
        found = RE_NUM_ACCESSION_HEADER.search(header)
        assert found is not None
        assert found.group(1) == "23L35781"

        # The regex anonymisation pass must also log it in the audit trail.
        result = anonymise_document_regex([header], [[]], dictionaries)
        assert self._audit_contains(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number is replaced by ``[DOSSIER]`` and kept in the audit."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(
            ["N° examen : 23L35781"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "N° examen : [DOSSIER]"
        assert self._audit_contains(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code is masked while the ``Code postal`` label is kept."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(
            ["Code postal : 64100"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert self._audit_contains(result.audit, "CODE_POSTAL", "64100")
|
||||
Reference in New Issue
Block a user