# anonymisation/tests/unit/test_header_pii_detection.py

#!/usr/bin/env python3
"""
Tests de non-régression pour les fuites en en-tête de document.
"""
from anonymizer_core_refactored_onnx import (
    RE_NUM_ACCESSION_HEADER,
    RE_NUM_EXAMEN_PATIENT,
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
)


class TestHeaderPiiDetection:
    """Real cases seen in production: upper-case patient name + compact exam number."""

    @staticmethod
    def _audit_has(audit, kind, original):
        """Return True if any audit entry has the given ``kind`` and ``original``.

        ``kind`` is compared first and ``original`` only when it matches, so
        entries of another kind never have their ``original`` inspected.
        """
        for entry in audit:
            if entry.kind == kind and entry.original == original:
                return True
        return False

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-token name line comes out as three ``[NOM]`` tags."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries
        )

        # None of the original tokens may survive in the output text.
        for name_token in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert name_token not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam-number regex captures the compact id in group 1."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <id>`` header line is matched and recorded as DOSSIER."""
        dictionaries = load_dictionaries(None)
        header = "".join(
            [
                "N° 23L35781\n",
                "Prélevé le 26/07/2023\n",
                "Enregistré le 27/07/2023\n",
            ]
        )

        # First check the pattern on its own, then the full regex pass.
        found = RE_NUM_ACCESSION_HEADER.search(header)
        assert found is not None
        assert found.group(1) == "23L35781"

        result = anonymise_document_regex([header], [[]], dictionaries)
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number is replaced by ``[DOSSIER]`` and audited."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(
            ["N° examen : 23L35781"], [[]], dictionaries
        )
        # The text is compared after the selective rescan of the regex output.
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "N° examen : [DOSSIER]"
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code keeps its label, is masked, and is audited."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(
            ["Code postal : 12345"], [[]], dictionaries
        )
        # The text is compared after the selective rescan of the regex output.
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert self._audit_has(result.audit, "CODE_POSTAL", "12345")