Couvre les corrections PII batch A/A-2, le NIR multi-ligne en flux reel, le gazetteer FINESS Corse derive depuis la base locale, et les tests de regression associes. Aucun build ni diffusion.
264 lines
8.0 KiB
Python
264 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tests de non-régression sur des layouts d'identifiants vus en documents réels.
|
|
"""
|
|
from anonymizer_core_refactored_onnx import (
|
|
PiiHit,
|
|
RE_SCAN_FILENAME_ARTIFACT,
|
|
anonymise_document_regex,
|
|
fitz,
|
|
load_dictionaries,
|
|
redact_pdf_vector,
|
|
)
|
|
|
|
|
|
def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """Check that the bare number placed before the "IPP :" label is masked.

    The sample page spreads the "N° venue :" label, a date, a name and the
    number over separate lines. The number must disappear from the output,
    a "[NDA]" placeholder must be present, and the audit must contain an
    "NDA" hit whose original value is that number.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "Diffusé le :\n",
        "à\n",
        "N° venue :\n",
        "31/07/1973\n",
        "VAN DE GRAAF\n",
        "23176885\n",
        "IPP :\n",
        "2300201230\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    venue_number = "23176885"
    assert venue_number not in result.text_out
    assert "[NDA]" in result.text_out
    assert any(
        hit.kind == "NDA" and hit.original == venue_number
        for hit in result.audit
    )
|
|
|
|
|
|
def test_scan_filename_artifact_suffix_is_masked():
    """Check that both numeric segments of a scan filename are replaced.

    The sample ends with "EXT2-16014215-2300249096.TIF"; the expected output
    contains "EXT2-[IPP]-[DOSSIER].TIF" and no longer contains the trailing
    number.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "IPP:\n",
        "16014215\n",
        "Document scanné non\n",
        "éditable pour patient (dont\ngénétique)\n",
        "EXT2-16014215-2300249096.TIF\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    # The pattern is expected to match a filename whose middle segment
    # already reads "[IPP]".
    half_masked_name = "EXT2-[IPP]-2300249096.TIF"
    assert RE_SCAN_FILENAME_ARTIFACT.search(half_masked_name) is not None

    masked = result.text_out
    assert "2300249096" not in masked
    assert "EXT2-[IPP]-[DOSSIER].TIF" in masked
|
|
|
|
|
|
def test_practitioner_council_form_masks_professional_name_and_preserves_pmsi_codes():
    """Check name masking and code preservation on a practitioner-council form.

    The name after "Nom du praticien-conseil :" must become "[NOM]" and be
    audited as "NOM_FORCE". The "N° OGC : 14" line, the "07C141" code and the
    "142 : La facturation" line must stay intact, and no "OGC" or "OGC_court"
    hit may be recorded.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "N° OGC : 14\n",
        "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL (une fiche par RUM)\n",
        "Nom du praticien-conseil : V NOMTEST\n",
        "DP K851 PANCREATITE AIG. BIL.\n",
        "GHM après recodage : 07C141\n",
        "ARGUMENTAIRE DU MEDECIN CONTROLEUR\n",
        "142 : La facturation du GHS par l'etablissement n'est pas conforme\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    # The practitioner's name is gone and replaced in place.
    assert "NOMTEST" not in masked
    assert "Nom du praticien-conseil : [NOM]" in masked

    # Numeric content of the form is left untouched.
    assert "N° OGC : 14" in masked
    assert "07C141" in masked
    assert "142 : La facturation" in masked

    ogc_kinds = {"OGC", "OGC_court"}
    assert not any(hit.kind in ogc_kinds for hit in result.audit)

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any("NOMTEST" in original for original in forced_originals)
|
|
|
|
|
|
def test_ogc_is_still_masked_outside_practitioner_council_form():
    """Check that an OGC number is masked in an ordinary report.

    The value after "N° OGC :" must be replaced by "[OGC]" and recorded in
    the audit as an "OGC" hit whose original value is "12".
    """
    dictionaries = load_dictionaries(None)
    page_text = "N° OGC : 12\nCompte rendu standard\n"

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "N° OGC : [OGC]" in masked
    assert "N° OGC : 12" not in masked

    ogc_originals = [hit.original for hit in result.audit if hit.kind == "OGC"]
    assert any(original == "12" for original in ogc_originals)
|
|
|
|
|
|
def test_ogc_pdf_redaction_does_not_mask_numeric_substrings(tmp_path):
    """Check that redacting the OGC value "14" spares longer tokens containing it.

    A one-page PDF is written with three lines of text, redacted with a
    single "OGC" hit for "14", then re-read. The strings "07C141" and
    "142 : La facturation" must still be extractable from the result.

    Args:
        tmp_path: directory in which the source and redacted PDFs are written.
    """
    if fitz is None:
        # NOTE(review): a bare return makes the run report a pass, not a
        # skip, whenever `fitz` is None.
        return

    source = tmp_path / "ogc_substrings.pdf"
    output = tmp_path / "ogc_substrings.redacted.pdf"

    # Build the fixture; the handle is released even if a call below raises.
    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), "N° OGC : 14")
        page.insert_text((72, 100), "GHM apres recodage : 07C141")
        page.insert_text((72, 128), "142 : La facturation reste lisible")
        doc.save(source)
    finally:
        doc.close()

    redact_pdf_vector(source, [PiiHit(0, "OGC", "14", "[OGC]")], output)

    # Read the redacted page back, again closing the handle on any outcome.
    redacted = fitz.open(output)
    try:
        text = redacted[0].get_text()
    finally:
        redacted.close()

    assert "07C141" in text
    assert "142 : La facturation" in text
|
|
|
|
|
|
def test_pdf_redaction_directly_masks_finess_address_range(tmp_path):
    """Check that an establishment address is redacted without an audit hit.

    Case Dom 2026-06-16: an establishment address visible in the PDF must be
    redacted even if the audit did not supply the exact hit. The redaction is
    therefore called with an empty hit list; the address fragments must be
    gone from the extracted text while the clinical line stays readable.

    Args:
        tmp_path: directory in which the source and redacted PDFs are written.
    """
    if fitz is None:
        # NOTE(review): a bare return makes the run report a pass, not a
        # skip, whenever `fitz` is None.
        return

    source = tmp_path / "finess_address_range.pdf"
    output = tmp_path / "finess_address_range.redacted.pdf"

    # Build the fixture; the handle is released even if a call below raises.
    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), "15 à 35 rue Claude Boucher Bordeaux Cedex")
        page.insert_text((72, 108), "Motif d'hospitalisation : contrôle clinique.")
        doc.save(source)
    finally:
        doc.close()

    # No hits are supplied on purpose.
    redact_pdf_vector(source, [], output)

    # Read the redacted page back, again closing the handle on any outcome.
    redacted = fitz.open(output)
    try:
        text = redacted[0].get_text()
    finally:
        redacted.close()

    assert "Claude Boucher" not in text
    assert "15 à 35" not in text
    assert "Motif d'hospitalisation" in text
|
|
|
|
|
|
def test_crop_epi_header_name_is_masked():
    """Check that the "SURNAME, Given-name" part of a "CROp Epi -" header is masked.

    Both name parts must vanish, the header must read "CROp Epi - [NOM]", and
    a "NOM_FORCE" audit hit must contain the surname.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "CROp Epi - NOMTEST, Jean-Michel\n",
        "Compte rendu opératoire\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "NOMTEST" not in masked
    assert "Jean-Michel" not in masked
    assert "CROp Epi - [NOM]" in masked

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any("NOMTEST" in original for original in forced_originals)
|
|
|
|
|
|
def test_crop_epi_header_name_with_spaced_suffix_is_masked():
    """Check "CROp Epi -" header masking when a second uppercase name follows.

    The header holds "NOMTEST, Marie NOMSUFFIX"; all three tokens must vanish,
    the header must read "CROp Epi - [NOM]", and a "NOM_FORCE" audit hit must
    contain the trailing suffix.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "CROp Epi - NOMTEST, Marie NOMSUFFIX\n",
        "Compte rendu opératoire\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "NOMTEST" not in masked
    assert "Marie" not in masked
    assert "NOMSUFFIX" not in masked
    assert "CROp Epi - [NOM]" in masked

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any("NOMSUFFIX" in original for original in forced_originals)
|
|
|
|
|
|
def test_standalone_compound_signature_name_is_masked():
    """Check that a name alone on a line, with a hyphenated first part, is masked.

    "Alix-Pierre Nomtest" sits between two ordinary sentences; both parts must
    vanish, a "[NOM]" placeholder must appear, and a "NOM_FORCE" audit hit
    must contain the full name.
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "Observation clinique stable.\n",
        "Alix-Pierre Nomtest\n",
        "Suite de la prise en charge.\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "Alix-Pierre" not in masked
    assert "Nomtest" not in masked
    assert "[NOM]" in masked

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any(
        "Alix-Pierre Nomtest" in original for original in forced_originals
    )
|
|
|
|
|
|
def test_modified_by_application_banner_name_is_masked():
    """Check that the name following "par" in a "(mod. le ... par ...)" banner is masked.

    The banner must end up reading "par [NOM], statut", neither name part may
    remain, and a "NOM_FORCE" audit hit must contain "NOMTEST Fanny".
    """
    dictionaries = load_dictionaries(None)
    banner_parts = [
        ">>>CRO type 10/04/23 14 : 19 ",
        "(mod. le 13/04/23 15:58 par NOMTEST Fanny, statut : complet)\n",
    ]
    page_text = "".join(banner_parts)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "NOMTEST" not in masked
    assert "Fanny" not in masked
    assert "par [NOM], statut" in masked

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any("NOMTEST Fanny" in original for original in forced_originals)
|
|
|
|
|
|
def test_ref_initials_are_page_local_for_pdf_redaction():
    """Check that initials after "Réf :" are masked and audited on page 0.

    Each side of the slash must be replaced by "[NOM]", and each must have a
    "NOM_INITIAL" audit hit carrying its original value and page index 0.
    """
    dictionaries = load_dictionaries(None)
    page_text = "Réf : RBG/FL\n"

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    assert "Réf : [NOM]/[NOM]" in result.text_out

    def has_initial_hit(initials):
        # True when the audit holds a page-0 NOM_INITIAL hit for these initials.
        return any(
            hit.kind == "NOM_INITIAL"
            and hit.original == initials
            and hit.page == 0
            for hit in result.audit
        )

    assert has_initial_hit("RBG")
    assert has_initial_hit("FL")
|
|
|
|
|
|
def test_ref_initials_with_suffix_label_are_masked():
    """Check initials masking when the label carries a suffix ("Réf_CRO :").

    "EG/PB" must vanish, the line must read "Réf_CRO : [NOM]/[NOM]", and each
    pair of initials must have a "NOM_INITIAL" audit hit on page index 0.
    """
    dictionaries = load_dictionaries(None)
    page_text = "Réf_CRO : EG/PB\nCOMPTE-RENDU OPÉRATOIRE\n"

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "EG/PB" not in masked
    assert "Réf_CRO : [NOM]/[NOM]" in masked

    def has_initial_hit(initials):
        # True when the audit holds a page-0 NOM_INITIAL hit for these initials.
        return any(
            hit.kind == "NOM_INITIAL"
            and hit.original == initials
            and hit.page == 0
            for hit in result.audit
        )

    assert has_initial_hit("EG")
    assert has_initial_hit("PB")
|
|
|
|
|
|
def test_staff_role_with_interne_name_is_masked():
    """Check that the name following "Aide : l'interne" is masked.

    The input's second line already contains a "[NOM]" placeholder. The first
    line must become "Aide : l'interne [NOM]", and a "NOM_FORCE" audit hit
    must contain "Charles NOMTEST".
    """
    dictionaries = load_dictionaries(None)
    page_text = "Aide : l'interne Charles NOMTEST\nAnesthésiste : Docteur [NOM]\n"

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "Charles" not in masked
    assert "NOMTEST" not in masked
    assert "Aide : l'interne [NOM]" in masked

    forced_originals = [
        hit.original for hit in result.audit if hit.kind == "NOM_FORCE"
    ]
    assert any("Charles NOMTEST" in original for original in forced_originals)
|
|
|
|
|
|
def test_trackare_iao_multiline_staff_name_is_masked():
    """Check that a name on the line after a bare "IAO" label is masked.

    Neither part of "NOMTEST Marlène" may remain in the output, and the audit
    must contain at least one hit of kind "NOM", "NOM_FORCE" or "NOM_GLOBAL".
    """
    dictionaries = load_dictionaries(None)
    page_lines = [
        "Heure d'orientation\n",
        "18 : 48\n",
        "IAO\n",
        "NOMTEST Marlène\n",
        "Priorité\n",
    ]
    page_text = "".join(page_lines)

    result = anonymise_document_regex([page_text], [[]], dictionaries)
    masked = result.text_out

    assert "NOMTEST" not in masked
    assert "Marlène" not in masked

    name_kinds = {"NOM", "NOM_FORCE", "NOM_GLOBAL"}
    assert any(hit.kind in name_kinds for hit in result.audit)
|