110 lines
3.3 KiB
Python
110 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Non-regression tests for identifier layouts seen in real documents.
|
|
"""
|
|
from anonymizer_core_refactored_onnx import (
|
|
PiiHit,
|
|
RE_SCAN_FILENAME_ARTIFACT,
|
|
anonymise_document_regex,
|
|
fitz,
|
|
load_dictionaries,
|
|
redact_pdf_vector,
|
|
)
|
|
|
|
|
|
def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """Check that the number printed just above the ``IPP :`` label is masked.

    The sample is a multi-line layout in which the ``N° venue :`` label is
    separated from its number by two other lines. The number must disappear
    from the output, an ``[NDA]`` placeholder must be present, and the audit
    trail must contain an ``NDA`` hit whose original value is that number.
    """
    dictionaries = load_dictionaries(None)
    page_text = (
        "Diffusé le :\n"
        "à\n"
        "N° venue :\n"
        "31/07/1973\n"
        "VAN DE GRAAF\n"
        "23176885\n"
        "IPP :\n"
        "2300201230\n"
    )

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    assert "23176885" not in result.text_out
    assert "[NDA]" in result.text_out

    # At least one audit entry must record the venue number as an NDA hit.
    nda_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NDA" and hit.original == "23176885"
    ]
    assert nda_hits
|
|
|
|
|
|
def test_scan_filename_artifact_suffix_is_masked():
    """Check that both numbers embedded in a scanned-file name are masked.

    The sample ends with a ``EXT2-<number>-<number>.TIF`` line whose first
    number also appears right after the ``IPP:`` label. The expected output
    replaces the first number with ``[IPP]`` and the second with ``[DOSSIER]``.
    """
    dictionaries = load_dictionaries(None)
    page_text = (
        "IPP:\n"
        "16014215\n"
        "Document scanné non\n"
        "éditable pour patient (dont\ngénétique)\n"
        "EXT2-16014215-2300249096.TIF\n"
    )

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    # The artifact pattern has to match the file name even after its first
    # number has already been swapped for the [IPP] placeholder.
    half_masked_name = "EXT2-[IPP]-2300249096.TIF"
    artifact_match = RE_SCAN_FILENAME_ARTIFACT.search(half_masked_name)
    assert artifact_match is not None

    assert "2300249096" not in result.text_out
    assert "EXT2-[IPP]-[DOSSIER].TIF" in result.text_out
|
|
|
|
|
|
def test_practitioner_council_form_masks_professional_name_and_preserves_pmsi_codes():
    """Check name masking and code preservation on a practitioner-council form.

    On this form the name following ``Nom du praticien-conseil :`` must be
    replaced by ``[NOM]`` and logged as a ``NOM_FORCE`` hit, while the OGC
    number, the GHM code and the numbered argument line must stay readable
    and no ``OGC`` / ``OGC_court`` hit may be recorded.
    """
    dictionaries = load_dictionaries(None)
    page_text = (
        "N° OGC : 14\n"
        "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL (une fiche par RUM)\n"
        "Nom du praticien-conseil : V NOMTEST\n"
        "DP K851 PANCREATITE AIG. BIL.\n"
        "GHM après recodage : 07C141\n"
        "ARGUMENTAIRE DU MEDECIN CONTROLEUR\n"
        "142 : La facturation du GHS par l'etablissement n'est pas conforme\n"
    )

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    # The practitioner's name is gone and replaced by the placeholder.
    assert "NOMTEST" not in result.text_out
    assert "Nom du praticien-conseil : [NOM]" in result.text_out

    # Numeric fragments that must survive anonymisation untouched.
    for preserved in ("N° OGC : 14", "07C141", "142 : La facturation"):
        assert preserved in result.text_out

    # No OGC-type hit may be logged for this form.
    ogc_kinds = {"OGC", "OGC_court"}
    assert all(hit.kind not in ogc_kinds for hit in result.audit)

    # The name must be logged as a NOM_FORCE hit.
    forced_name_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NOM_FORCE" and "NOMTEST" in hit.original
    ]
    assert forced_name_hits
|
|
|
|
|
|
def test_ogc_is_still_masked_outside_practitioner_council_form():
    """Check that an OGC number is masked in a document that is not the form.

    The value after ``N° OGC :`` must be replaced by ``[OGC]`` and the audit
    trail must contain an ``OGC`` hit whose original value is ``"12"``.
    """
    dictionaries = load_dictionaries(None)
    page_text = "N° OGC : 12\nCompte rendu standard\n"

    result = anonymise_document_regex([page_text], [[]], dictionaries)

    assert "N° OGC : [OGC]" in result.text_out
    assert "N° OGC : 12" not in result.text_out

    ogc_hits = [
        hit
        for hit in result.audit
        if hit.kind == "OGC" and hit.original == "12"
    ]
    assert ogc_hits
|
|
|
|
|
|
def test_ogc_pdf_redaction_does_not_mask_numeric_substrings(tmp_path):
    """Check that redacting OGC value ``14`` leaves longer tokens intact.

    A one-page PDF is written with three lines: the OGC line, a GHM code
    containing ``14`` (``07C141``) and a line starting with ``142``. After
    ``redact_pdf_vector`` runs with a single ``OGC`` hit for ``"14"``, the
    text extracted from the output must still contain the last two.

    Args:
        tmp_path: Directory in which the source and redacted PDF files are
            written (presumably pytest's ``tmp_path`` fixture).
    """
    if fitz is None:
        # NOTE(review): returning here makes the test report as passed when
        # the PDF backend is unavailable; an explicit skip would be clearer.
        return

    source = tmp_path / "ogc_substrings.pdf"
    output = tmp_path / "ogc_substrings.redacted.pdf"

    # Build the fixture document; close it even if a drawing call or the
    # save fails, so the handle is not leaked on error.
    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((72, 72), "N° OGC : 14")
        page.insert_text((72, 100), "GHM apres recodage : 07C141")
        page.insert_text((72, 128), "142 : La facturation reste lisible")
        doc.save(source)
    finally:
        doc.close()

    redact_pdf_vector(source, [PiiHit(0, "OGC", "14", "[OGC]")], output)

    # Read the redacted text back, releasing the file handle before the
    # assertions run regardless of whether text extraction succeeds.
    redacted = fitz.open(output)
    try:
        text = redacted[0].get_text()
    finally:
        redacted.close()

    assert "07C141" in text
    assert "142 : La facturation" in text
|