#!/usr/bin/env python3
"""
Non-regression tests for PII leaks in document headers.
"""

from anonymizer_core_refactored_onnx import (
    RE_NUM_ACCESSION_HEADER,
    RE_NUM_EXAMEN_PATIENT,
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
)


def _audit_has(audit, kind, original):
    """Return True if any entry of *audit* has the given ``kind`` and ``original``."""
    for hit in audit:
        if hit.kind == kind and hit.original == original:
            return True
    return False


class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps patient name line is fully replaced by placeholders."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries
        )

        # None of the original name tokens may remain in the output.
        for token in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert token not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam-number pattern captures the compact identifier."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N°`` header line is matched and recorded in the audit."""
        dictionaries = load_dictionaries(None)
        header = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        # First check the header pattern on its own.
        found = RE_NUM_ACCESSION_HEADER.search(header)
        assert found is not None
        assert found.group(1) == "23L35781"

        # Then check the audit produced by the regex anonymisation call.
        result = anonymise_document_regex([header], [[]], dictionaries)
        assert _audit_has(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number is masked in the text and listed in the audit."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["N° examen : 23L35781"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)

        # The label must be kept; only the identifier becomes a placeholder.
        assert rescanned == "N° examen : [DOSSIER]"
        assert _audit_has(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code keeps its label and is listed in the audit."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["Code postal : 12345"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)

        # The label must be kept; only the code becomes a placeholder.
        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert _audit_has(result.audit, "CODE_POSTAL", "12345")