#!/usr/bin/env python3
"""
Regression tests for identifier layouts observed in real documents.
"""

from anonymizer_core_refactored_onnx import (
    RE_SCAN_FILENAME_ARTIFACT,
    anonymise_document_regex,
    load_dictionaries,
)


def _anonymise(text: str):
    """Run the regex anonymiser on a single-page document made of *text*.

    The dictionaries are loaded with ``load_dictionaries(None)`` and the page
    is passed together with one empty companion list, exactly as each test
    needs it. Returns whatever ``anonymise_document_regex`` returns.
    """
    cfg = load_dictionaries(None)
    return anonymise_document_regex([text], [[]], cfg)


def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """A number listed after the "N° venue :" label, ahead of "IPP :", is masked.

    In this layout the label is followed by a date and a name before the
    number itself appears on its own line.
    """
    venue_number = "23176885"
    page = (
        "Diffusé le :\n"
        "à\n"
        "N° venue :\n"
        "31/07/1973\n"
        "VAN DE GRAAF\n"
        "23176885\n"
        "IPP :\n"
        "2300201230\n"
    )

    result = _anonymise(page)

    # The raw number must be gone and replaced by the NDA placeholder.
    assert venue_number not in result.text_out
    assert "[NDA]" in result.text_out

    # The audit trail must record the replacement with its original value.
    nda_hits = [
        hit
        for hit in result.audit
        if hit.kind == "NDA" and hit.original == venue_number
    ]
    assert nda_hits


def test_scan_filename_artifact_suffix_is_masked():
    """The trailing number of a scanned-document file name is masked too."""
    page = (
        "IPP:\n"
        "16014215\n"
        "Document scanné non\n"
        "éditable pour patient (dont\ngénétique)\n"
        "EXT2-16014215-2300249096.TIF\n"
    )

    result = _anonymise(page)

    # The pattern has to recognise the file name once its IPP part is masked.
    partially_masked = "EXT2-[IPP]-2300249096.TIF"
    assert RE_SCAN_FILENAME_ARTIFACT.search(partially_masked) is not None

    # Both numeric parts of the file name end up replaced in the output.
    assert "2300249096" not in result.text_out
    assert "EXT2-[IPP]-[DOSSIER].TIF" in result.text_out