# anonymisation/tests/unit/test_real_world_identifier_layouts.py

#!/usr/bin/env python3
"""
Non-regression tests for identifier layouts seen in real-world documents.
"""
from anonymizer_core_refactored_onnx import (
    PiiHit,
    RE_SCAN_FILENAME_ARTIFACT,
    anonymise_document_regex,
    fitz,
    load_dictionaries,
    redact_pdf_vector,
)


def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """Check that a venue number sitting several lines below its label is masked.

    The sample page places a date and a name between the "N° venue :" label
    and the number, which is itself immediately followed by the IPP label.
    The number must disappear from the output, an "[NDA]" placeholder must be
    present, and the audit must hold an NDA hit carrying the original number.
    """
    venue_number = "23176885"
    page_text = "".join(
        [
            "Diffusé le :\n",
            "à\n",
            "N° venue :\n",
            "31/07/1973\n",
            "VAN DE GRAAF\n",
            "23176885\n",
            "IPP :\n",
            "2300201230\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    assert venue_number not in result.text_out
    assert "[NDA]" in result.text_out
    nda_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NDA" and hit.original == venue_number
    ]
    assert nda_hits


def test_scan_filename_artifact_suffix_is_masked():
    """Check that the trailing number of a scan file name is masked too.

    The file name embeds the IPP followed by a second number. The test first
    confirms that RE_SCAN_FILENAME_ARTIFACT matches the file name once the IPP
    part has been replaced, then that the second number is gone from the
    output and that the fully masked file name is present.
    """
    dictionaries = load_dictionaries(None)
    page_text = "".join(
        [
            "IPP:\n",
            "16014215\n",
            "Document scanné non\n",
            "éditable pour patient (dont\ngénétique)\n",
            "EXT2-16014215-2300249096.TIF\n",
        ]
    )

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    # A match object is always truthy, so this is the same check as
    # comparing the search result against None.
    partially_masked_name = "EXT2-[IPP]-2300249096.TIF"
    assert RE_SCAN_FILENAME_ARTIFACT.search(partially_masked_name)

    masked = result.text_out
    assert "2300249096" not in masked
    assert "EXT2-[IPP]-[DOSSIER].TIF" in masked


def test_practitioner_council_form_masks_professional_name_and_preserves_pmsi_codes():
    """Check name masking and code preservation on a practitioner-council form.

    The adviser's name must be replaced by "[NOM]" and logged as a NOM_FORCE
    hit. The "N° OGC : 14" line, the "07C141" code and the line starting with
    "142 :" must come out unchanged, and no hit of kind OGC or OGC_court may
    appear in the audit.
    """
    form_text = "".join(
        [
            "N° OGC : 14\n",
            "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL (une fiche par RUM)\n",
            "Nom du praticien-conseil : V NOMTEST\n",
            "DP  K851  PANCREATITE AIG. BIL.\n",
            "GHM après recodage : 07C141\n",
            "ARGUMENTAIRE DU MEDECIN CONTROLEUR\n",
            "142 : La facturation du GHS par l'etablissement n'est pas conforme\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([form_text], [[]], dictionaries)

    assert "NOMTEST" not in result.text_out
    # Fragments that must survive anonymisation, checked in the original order.
    for preserved in (
        "Nom du praticien-conseil : [NOM]",
        "N° OGC : 14",
        "07C141",
        "142 : La facturation",
    ):
        assert preserved in result.text_out

    ogc_kinds = {"OGC", "OGC_court"}
    assert all(hit.kind not in ogc_kinds for hit in result.audit)

    forced_name_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "NOMTEST" in hit.original
    ]
    assert forced_name_hits


def test_ogc_is_still_masked_outside_practitioner_council_form():
    """Check that an OGC number in an ordinary report is replaced by "[OGC]".

    The audit must also contain an OGC hit whose original value is "12".
    """
    report_text = "N° OGC : 12\nCompte rendu standard\n"

    result = anonymise_document_regex(
        [report_text], [[]], load_dictionaries(None)
    )

    masked = result.text_out
    assert "N° OGC : [OGC]" in masked
    assert "N° OGC : 12" not in masked
    ogc_hits = [
        hit for hit in result.audit if hit.kind == "OGC" and hit.original == "12"
    ]
    assert ogc_hits


def test_ogc_pdf_redaction_does_not_mask_numeric_substrings(tmp_path):
    """Check that redacting the OGC value "14" leaves longer tokens readable.

    A one-page PDF is written with three text lines: the OGC line, a line
    holding the code "07C141" and a line starting with "142 :". After
    ``redact_pdf_vector`` runs with a single OGC hit for "14", the text
    extracted from the output must still contain "07C141" and
    "142 : La facturation".

    Args:
        tmp_path: pytest-provided temporary directory for the two PDF files.

    Raises:
        unittest.SkipTest: when ``fitz`` is ``None``, so the run is reported
            as skipped instead of silently passing without checking anything.
    """
    if fitz is None:
        # pytest reports unittest.SkipTest as a skip; a bare return would be
        # counted as a pass and hide the missing coverage.
        import unittest

        raise unittest.SkipTest("fitz is not available; PDF redaction not tested")

    source = tmp_path / "ogc_substrings.pdf"
    output = tmp_path / "ogc_substrings.redacted.pdf"

    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), "N° OGC : 14")
        page.insert_text((72, 100), "GHM apres recodage : 07C141")
        page.insert_text((72, 128), "142 : La facturation reste lisible")
        doc.save(source)
    finally:
        # Release the document even if building or saving it failed.
        doc.close()

    # NOTE(review): PiiHit arguments are presumably (page, kind, original,
    # placeholder) — confirm against the PiiHit definition.
    redact_pdf_vector(source, [PiiHit(0, "OGC", "14", "[OGC]")], output)

    redacted = fitz.open(output)
    try:
        text = redacted[0].get_text()
    finally:
        redacted.close()

    assert "07C141" in text
    assert "142 : La facturation" in text


def test_pdf_redaction_directly_masks_finess_address_range(tmp_path):
    """Check that a visible establishment address is redacted without any hit.

    Case Dom 2026-06-16: an establishment address visible in the PDF must be
    redacted even when the audit did not supply the exact hit. The test
    therefore calls ``redact_pdf_vector`` with an empty hit list and checks
    that the street name and the "15 à 35" number range are gone from the
    extracted text while the clinical line is still present.

    Args:
        tmp_path: pytest-provided temporary directory for the two PDF files.

    Raises:
        unittest.SkipTest: when ``fitz`` is ``None``, so the run is reported
            as skipped instead of silently passing without checking anything.
    """
    if fitz is None:
        # pytest reports unittest.SkipTest as a skip; a bare return would be
        # counted as a pass and hide the missing coverage.
        import unittest

        raise unittest.SkipTest("fitz is not available; PDF redaction not tested")

    source = tmp_path / "finess_address_range.pdf"
    output = tmp_path / "finess_address_range.redacted.pdf"

    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), "15 à 35 rue Claude Boucher Bordeaux Cedex")
        page.insert_text((72, 108), "Motif d'hospitalisation : contrôle clinique.")
        doc.save(source)
    finally:
        # Release the document even if building or saving it failed.
        doc.close()

    # Deliberately no hits: the redaction must find the address on its own.
    redact_pdf_vector(source, [], output)

    redacted = fitz.open(output)
    try:
        text = redacted[0].get_text()
    finally:
        redacted.close()

    assert "Claude Boucher" not in text
    assert "15 à 35" not in text
    assert "Motif d'hospitalisation" in text


def test_crop_epi_header_name_is_masked():
    """Check that the "SURNAME, Given-name" part of a "CROp Epi -" header is masked.

    Neither name part may remain in the output, the header must read
    "CROp Epi -  [NOM]", and the audit must hold a NOM_FORCE hit whose
    original text contains the surname.
    """
    header_text = "".join(
        [
            "CROp Epi -  NOMTEST, Jean-Michel\n",
            "Compte rendu opératoire\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([header_text], [[]], dictionaries)

    for leaked in ("NOMTEST", "Jean-Michel"):
        assert leaked not in result.text_out
    assert "CROp Epi -  [NOM]" in result.text_out
    forced_name_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "NOMTEST" in hit.original
    ]
    assert forced_name_hits


def test_crop_epi_header_name_with_spaced_suffix_is_masked():
    """Check that a second name separated by a run of spaces is masked as well.

    The header holds "SURNAME, Given-name" followed by several spaces and
    another upper-case name. All three tokens must be gone from the output,
    the header must read "CROp Epi -  [NOM]", and the audit must hold a
    NOM_FORCE hit whose original text contains the trailing name.
    """
    header_text = "".join(
        [
            "CROp Epi -  NOMTEST, Marie     NOMSUFFIX\n",
            "Compte rendu opératoire\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([header_text], [[]], dictionaries)

    for leaked in ("NOMTEST", "Marie", "NOMSUFFIX"):
        assert leaked not in result.text_out
    assert "CROp Epi -  [NOM]" in result.text_out
    suffix_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "NOMSUFFIX" in hit.original
    ]
    assert suffix_hits


def test_standalone_compound_signature_name_is_masked():
    """Check that a hyphenated given name plus surname alone on a line is masked.

    Both name parts must be gone from the output, a "[NOM]" placeholder must
    be present, and the audit must hold a NOM_FORCE hit whose original text
    contains the full "Alix-Pierre Nomtest" string.
    """
    note_text = "".join(
        [
            "Observation clinique stable.\n",
            "Alix-Pierre Nomtest\n",
            "Suite de la prise en charge.\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([note_text], [[]], dictionaries)

    for leaked in ("Alix-Pierre", "Nomtest"):
        assert leaked not in result.text_out
    assert "[NOM]" in result.text_out
    signature_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "Alix-Pierre Nomtest" in hit.original
    ]
    assert signature_hits


def test_modified_by_application_banner_name_is_masked():
    """Check that the name after "par" in a "(mod. le … par …)" banner is masked.

    Both name parts must be gone from the output, the banner must read
    "par [NOM], statut", and the audit must hold a NOM_FORCE hit whose
    original text contains "NOMTEST Fanny".
    """
    banner_text = "".join(
        [
            ">>>CRO type 10/04/23 14 : 19   ",
            "(mod. le 13/04/23 15:58 par NOMTEST Fanny, statut : complet)\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([banner_text], [[]], dictionaries)

    for leaked in ("NOMTEST", "Fanny"):
        assert leaked not in result.text_out
    assert "par [NOM], statut" in result.text_out
    editor_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "NOMTEST Fanny" in hit.original
    ]
    assert editor_hits


def test_ref_initials_are_page_local_for_pdf_redaction():
    """Check that both initials groups after "Réf :" are masked and tied to page 0.

    Each group must be replaced by "[NOM]" and recorded as a NOM_INITIAL hit
    whose page is 0.
    """
    result = anonymise_document_regex(
        ["Réf : RBG/FL\n"], [[]], load_dictionaries(None)
    )

    assert "Réf : [NOM]/[NOM]" in result.text_out
    # One audit check per initials group, in the order they appear in the text.
    for initials in ("RBG", "FL"):
        matching = [
            hit
            for hit in result.audit
            if hit.kind == "NOM_INITIAL"
            and hit.original == initials
            and hit.page == 0
        ]
        assert matching


def test_ref_initials_with_suffix_label_are_masked():
    """Check that initials are masked when the label carries a suffix ("Réf_CRO :").

    The raw "EG/PB" pair must be gone, the line must read
    "Réf_CRO : [NOM]/[NOM]", and each group must be recorded as a NOM_INITIAL
    hit whose page is 0.
    """
    page_text = "Réf_CRO : EG/PB\nCOMPTE-RENDU OPÉRATOIRE\n"
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    masked = result.text_out
    assert "EG/PB" not in masked
    assert "Réf_CRO : [NOM]/[NOM]" in masked
    # One audit check per initials group, in the order they appear in the text.
    for initials in ("EG", "PB"):
        matching = [
            hit
            for hit in result.audit
            if hit.kind == "NOM_INITIAL"
            and hit.original == initials
            and hit.page == 0
        ]
        assert matching


def test_staff_role_with_interne_name_is_masked():
    """Check that the name following "Aide : l'interne" is masked.

    Both name parts must be gone from the output, the line must read
    "Aide : l'interne [NOM]", and the audit must hold a NOM_FORCE hit whose
    original text contains "Charles NOMTEST". The second input line already
    carries a "[NOM]" placeholder.
    """
    page_text = "Aide : l'interne Charles NOMTEST\nAnesthésiste : Docteur [NOM]\n"
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    for leaked in ("Charles", "NOMTEST"):
        assert leaked not in result.text_out
    assert "Aide : l'interne [NOM]" in result.text_out
    intern_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "Charles NOMTEST" in hit.original
    ]
    assert intern_hits


def test_trackare_iao_multiline_staff_name_is_masked():
    """Check that a staff name on the line below an "IAO" label is masked.

    Both name parts must be gone from the output and the audit must contain at
    least one hit of kind NOM, NOM_FORCE or NOM_GLOBAL.
    """
    page_text = "".join(
        [
            "Heure d'orientation\n",
            "18 : 48\n",
            "IAO\n",
            "NOMTEST Marlène\n",
            "Priorité\n",
        ]
    )
    dictionaries = load_dictionaries(None)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    for leaked in ("NOMTEST", "Marlène"):
        assert leaked not in result.text_out
    name_kinds = {"NOM", "NOM_FORCE", "NOM_GLOBAL"}
    name_hits = [hit for hit in result.audit if hit.kind in name_kinds]
    assert name_hits