diff --git a/tests/unit/test_f5_nom_compose_orphelin.py b/tests/unit/test_f5_nom_compose_orphelin.py
new file mode 100644
index 0000000..8dc6d6b
--- /dev/null
+++ b/tests/unit/test_f5_nom_compose_orphelin.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+Non-regression test for the F5 fix (commit 299bbee).
+
+F5: post-pass masking the orphaned continuation of a compound surname that
+was split by a line break in the column-based Trackare format.
+
+Reproduced case:
+    ... 07:55 NOCENT-
+    EJNAINI
+
+The name "NOCENT-EJNAINI" is spread over two lines. The line-by-line NER
+cannot join them. The first component (NOCENT-) is masked through another
+replacement artefact, but the second one (EJNAINI) is left in clear text.
+
+F5 adds a post-masking regex that detects "[NOM]-\\n"
+and masks the orphaned token. The token must come directly after the line
+break (whitespace allowed), not after other text.
+
+Source: anonymizer_core_refactored_onnx.py, lines ~4505-4516,
+function process_pdf(), block "3a-bis) Nettoyage post-masquage".
+"""
+from __future__ import annotations
+
+import re
+
+import pytest
+
+from anonymizer_core_refactored_onnx import PLACEHOLDERS
+
+# ---------------------------------------------------------------------------
+# F5 regex — reproduced here for unit testing (identical to process_pdf)
+# ---------------------------------------------------------------------------
+_RE_NOM_ORPHAN = re.compile(
+    r"(\[NOM\]-\s*\n?\s*)([A-Z\u00C0-\u0178][A-Z\u00C0-\u0178'\-]{3,})\b"
+)
+
+# Medical stop-words excluded from masking (same list as process_pdf).
+# Defined once at module level so the set is not rebuilt on every call.
+_MEDICAL_STOP_WORDS = frozenset({
+    "ampoule", "ampoules", "comprime", "comprimes", "gelule", "gelules",
+    "solution", "solutions", "traitement", "traitements", "injection",
+    "perfusions", "prescription", "posologie", "diagnostic", "examen",
+    "resultat", "resultats", "observation", "antibiogramme", "bacterio",
+})
+
+
+def _apply_f5_nom_orphan(text: str) -> tuple[str, list[str]]:
+    """Apply the F5 post-pass to an orphaned compound-surname continuation.
+
+    Args:
+        text: Masked text to scan for ``[NOM]-`` followed by an orphaned token.
+
+    Returns:
+        The cleaned text and the list of masked tokens (for audit), in
+        match order.
+
+    Mirrors the logic of process_pdf() step 3a-bis.
+    """
+    hits: list[str] = []
+
+    def _clean(m: re.Match[str]) -> str:
+        tok = m.group(2)
+        # Medical vocabulary is returned unchanged and not recorded.
+        if tok.lower() in _MEDICAL_STOP_WORDS:
+            return m.group(0)
+        hits.append(tok)
+        # Keep the "[NOM]-" prefix (group 1) and replace only the token.
+        return m.group(1) + PLACEHOLDERS["NOM"]
+
+    cleaned = _RE_NOM_ORPHAN.sub(_clean, text)
+    return cleaned, hits
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestF5NomComposeOrphelin:
+    """F5 - orphaned continuation of a compound surname split by a line break."""
+
+    # -- Regex only --
+
+    def test_f5_regex_matches_nom_orphan_direct_after_dash_newline(self):
+        """The F5 regex captures an upper-case token directly after [NOM]-\\n."""
+        text = "[NOM]-\nEJNAINI"
+        match = _RE_NOM_ORPHAN.search(text)
+        assert match is not None
+        assert match.group(1) == "[NOM]-\n"
+        assert match.group(2) == "EJNAINI"
+
+    def test_f5_regex_matches_with_leading_spaces_on_next_line(self):
+        """The F5 regex tolerates spaces at the start of the next line."""
+        text = "[NOM]-\n EJNAINI"
+        match = _RE_NOM_ORPHAN.search(text)
+        assert match is not None
+        assert match.group(2) == "EJNAINI"
+
+    def test_f5_regex_matches_with_trailing_spaces_before_newline(self):
+        """The F5 regex tolerates spaces before the line break."""
+        text = "[NOM]- \n EJNAINI"
+        match = _RE_NOM_ORPHAN.search(text)
+        assert match is not None
+        assert match.group(2) == "EJNAINI"
+
+    def test_f5_regex_no_match_when_intervening_text(self):
+        """The F5 regex does NOT match if text separates [NOM]-\\n from the token.
+
+        This happens when the token is not a direct continuation of the
+        compound surname (e.g. another column of the Trackare table).
+        """
+        text = "[NOM]-\nAmpoule(s) EJNAINI"
+        # "Ampoule(s)" is not all upper-case, so the regex does not match it
+        # as group 2, and EJNAINI does not come directly after \n\s*
+        match = _RE_NOM_ORPHAN.search(text)
+        assert match is None, (
+            "F5 ne doit pas matcher quand du texte separe [NOM]- du token orphelin"
+        )
+
+    def test_f5_regex_rejects_lowercase_start(self):
+        """A token starting with a lower-case letter is not captured."""
+        match = _RE_NOM_ORPHAN.search("[NOM]-\nejnaini")
+        assert match is None
+
+    def test_f5_regex_minimum_length_4_chars(self):
+        """The token must be at least 4 characters long (1 + {3,})."""
+        assert _RE_NOM_ORPHAN.search("[NOM]-\nABC") is None, "3 chars = trop court"
+        assert _RE_NOM_ORPHAN.search("[NOM]-\nABCD") is not None, "4 chars = OK"
+
+    # -- F5 application --
+
+    def test_f5_apply_masks_orphan_token(self):
+        """_apply_f5_nom_orphan replaces the orphaned token with [NOM]."""
+        text = "[NOM]-\nEJNAINI"
+        cleaned, hits = _apply_f5_nom_orphan(text)
+        assert hits == ["EJNAINI"]
+        assert "[NOM]-" in cleaned
+        assert "EJNAINI" not in cleaned
+        # Both parts of the compound surname must be masked
+        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2
+
+    def test_f5_apply_preserves_context_around_orphan(self):
+        """The context around the orphaned name is left unchanged."""
+        text = "07:55 [NOM]-\nEJNAINI\nSuite du traitement"
+        cleaned, hits = _apply_f5_nom_orphan(text)
+        assert hits == ["EJNAINI"]
+        assert "07:55 " in cleaned
+        assert "Suite du traitement" in cleaned
+        assert "EJNAINI" not in cleaned
+
+    def test_f5_apply_multiple_orphans(self):
+        """F5 masks several orphans in the same text."""
+        text = "[NOM]-\nDUPONT\nAutre [NOM]-\nMARTIN"
+        cleaned, hits = _apply_f5_nom_orphan(text)
+        assert len(hits) == 2
+        assert "DUPONT" not in cleaned
+        assert "MARTIN" not in cleaned
+        assert cleaned.count(PLACEHOLDERS["NOM"]) == 4  # 2 initial + 2 orphans
+
+    def test_f5_no_false_positive_on_normal_text(self):
+        """F5 does not modify text without a [NOM]-\\n pattern."""
+        text = "Patient presente le [DATE]. Traitement prescrit."
+        cleaned, hits = _apply_f5_nom_orphan(text)
+        assert hits == []
+        assert cleaned == text
+
+    def test_f5_apply_keeps_medical_stop_word(self):
+        """A medical stop-word after [NOM]-\\n is neither masked nor logged."""
+        text = "[NOM]-\nTRAITEMENT"
+        cleaned, hits = _apply_f5_nom_orphan(text)
+        assert hits == []
+        assert cleaned == text
+
+    # -- Real Trackare case --
+
+    def test_f5_full_trackare_scenario(self):
+        """Full Trackare case: surname NOCENT-EJNAINI split by a line break
+        in the column-based PDF extraction.
+
+        Trackare column layout:
+            Name column: "07:55 NOCENT-"
+            Next line:   "EJNAINI"
+
+        After the initial masking (pre-F5):
+            "07:55 [NOM]-\\nEJNAINI"
+
+        After F5:
+            "07:55 [NOM]-\\n[NOM]"
+        """
+        # Input simulating the pre-F5 result (NOCENT masked, EJNAINI orphaned)
+        pre_f5 = "07:55 [NOM]-\nEJNAINI"
+
+        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
+
+        # Check: both components of the compound surname are masked
+        assert "[NOM]-" in cleaned, "Le 1er composant doit rester masque"
+        assert "EJNAINI" not in cleaned, "Le 2e composant orphelin doit etre masque par F5"
+        # Only the orphaned token is replaced; the prefix is kept verbatim
+        assert cleaned == "07:55 [NOM]-\n" + PLACEHOLDERS["NOM"]
+        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2, (
+            "Les deux parties du nom compose doivent etre masquees"
+        )
+        assert hits == ["EJNAINI"], "EJNAINI doit etre loggue dans l'audit"
+
+    def test_f5_trackare_with_spaces_in_column_alignment(self):
+        """Trackare case with column-alignment spaces."""
+        pre_f5 = "07:55 [NOM]- \n EJNAINI \nSuite"
+        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
+        assert hits == ["EJNAINI"]
+        assert "EJNAINI" not in cleaned
+        assert "Suite" in cleaned
+
+    @pytest.mark.parametrize("token", ["DUPONT-MARTIN", "D'ARTAGNAN"])
+    def test_f5_nom_compose_with_apostrophe_and_dash(self, token):
+        """Orphaned tokens containing apostrophes or hyphens are masked whole."""
+        cleaned, hits = _apply_f5_nom_orphan("[NOM]-\n" + token)
+        assert hits == [token]
+        assert token not in cleaned
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/unit/test_gui_batch_paths.py b/tests/unit/test_gui_batch_paths.py
new file mode 100644
index 0000000..0cda0c3
--- /dev/null
+++ b/tests/unit/test_gui_batch_paths.py
@@ -0,0 +1,76 @@
+"""Unit tests for the gui_batch_paths helpers."""
+
+from pathlib import Path
+
+from gui_batch_paths import (
+    build_batch_output_dir,
+    iter_pseudonymized_texts,
+    list_supported_documents,
+)
+
+
+def test_list_supported_documents_excludes_gui_output_tree(tmp_path: Path):
+    """Files under the ``anonymise`` output subtree are not listed as sources."""
+    root = tmp_path / "cases"
+    source_case = root / "001_patient_header_and_birth"
+    source_case.mkdir(parents=True)
+    nested_case = root / "002_contact_bundle"
+    nested_case.mkdir(parents=True)
+    output_case = root / "anonymise" / "001_patient_header_and_birth"
+    output_case.mkdir(parents=True)
+
+    source_txt = source_case / "test.txt"
+    source_pdf = nested_case / "source.pdf"
+    output_txt = output_case / "test.pseudonymise.txt"
+
+    source_txt.write_text("source", encoding="utf-8")
+    source_pdf.write_text("pdf", encoding="utf-8")
+    output_txt.write_text("output", encoding="utf-8")
+
+    documents = list_supported_documents(root, {".txt", ".pdf"})
+
+    assert documents == [source_txt, source_pdf]
+
+
+def test_build_batch_output_dir_preserves_relative_parent(tmp_path: Path):
+    """The source's parent, relative to the root, is kept under the output root."""
+    root = tmp_path / "cases"
+    output_root = root / "anonymise"
+    source = root / "010_spaced_establishment_header" / "test.txt"
+    source.parent.mkdir(parents=True)
+    source.write_text("test", encoding="utf-8")
+
+    output_dir = build_batch_output_dir(root, output_root, source)
+
+    assert output_dir == output_root / "010_spaced_establishment_header"
+
+
+def test_build_batch_output_dir_keeps_root_files_at_output_root(tmp_path: Path):
+    """A source located directly in the root maps to the output root itself."""
+    root = tmp_path / "cases"
+    output_root = root / "anonymise"
+    source = root / "test.txt"
+    root.mkdir(parents=True)
+    source.write_text("test", encoding="utf-8")
+
+    output_dir = build_batch_output_dir(root, output_root, source)
+
+    assert output_dir == output_root
+
+
+def test_iter_pseudonymized_texts_is_recursive(tmp_path: Path):
+    """Pseudonymised texts are found at any depth; other files are ignored."""
+    output_root = tmp_path / "anonymise"
+    nested = output_root / "001_patient_header_and_birth"
+    nested.mkdir(parents=True)
+    top_level = output_root / "summary.pseudonymise.txt"
+    nested_txt = nested / "test.pseudonymise.txt"
+    other_file = nested / "audit.jsonl"
+
+    top_level.write_text("top", encoding="utf-8")
+    nested_txt.write_text("nested", encoding="utf-8")
+    other_file.write_text("{}", encoding="utf-8")
+
+    found = sorted(iter_pseudonymized_texts(output_root))
+
+    assert found == [nested_txt, top_level]
diff --git a/tests/unit/test_manual_masking.py b/tests/unit/test_manual_masking.py
new file mode 100644
index 0000000..ee00c4d
--- /dev/null
+++ b/tests/unit/test_manual_masking.py
@@ -0,0 +1,68 @@
+"""Unit tests for the manual_masking helpers."""
+
+from pathlib import Path
+
+from manual_masking import (
+    DEFAULT_MASK_OUTPUT_DIRNAME,
+    DEFAULT_MASK_PREVIEW_DIRNAME,
+    append_jsonl_file,
+    ensure_mask_templates_dir,
+    list_mask_templates,
+    mask_templates_dir,
+    mask_template_label,
+    resolve_manual_mask_pdf,
+)
+
+
+def test_mask_templates_dir_is_under_config():
+    """mask_templates_dir returns ``<base>/config/mask_templates``."""
+    base = Path("/tmp/anonymisation")
+    assert mask_templates_dir(base) == base / "config" / "mask_templates"
+
+
+def test_ensure_mask_templates_dir_creates_folder(tmp_path: Path):
+    """ensure_mask_templates_dir creates the folder and returns its path."""
+    created = ensure_mask_templates_dir(tmp_path)
+    assert created == tmp_path / "config" / "mask_templates"
+    assert created.is_dir()
+
+
+def test_resolve_manual_mask_pdf_accepts_only_pdf():
+    """``.pdf``/``.PDF`` paths are returned as-is; ``.docx`` and None give None."""
+    assert resolve_manual_mask_pdf(Path("/tmp/test.pdf")) == Path("/tmp/test.pdf")
+    assert resolve_manual_mask_pdf(Path("/tmp/test.PDF")) == Path("/tmp/test.PDF")
+    assert resolve_manual_mask_pdf(Path("/tmp/test.docx")) is None
+    assert resolve_manual_mask_pdf(None) is None
+
+
+def test_manual_mask_outputs_follow_project_convention():
+    """The default output directory names keep their expected values."""
+    assert DEFAULT_MASK_OUTPUT_DIRNAME == "anonymise"
+    assert DEFAULT_MASK_PREVIEW_DIRNAME == "anonymise_preview"
+
+
+def test_list_mask_templates_filters_supported_extensions(tmp_path: Path):
+    """``.txt`` is skipped; ``.yml`` and nested ``.json`` templates are listed."""
+    templates_dir = ensure_mask_templates_dir(tmp_path)
+    kept = templates_dir / "alpha.yml"
+    other = templates_dir / "beta.txt"
+    nested = templates_dir / "nested" / "gamma.json"
+    nested.parent.mkdir(parents=True)
+    kept.write_text("x", encoding="utf-8")
+    other.write_text("x", encoding="utf-8")
+    nested.write_text("x", encoding="utf-8")
+
+    assert list_mask_templates(tmp_path) == [kept, nested]
+    assert mask_template_label(nested, tmp_path) == "nested/gamma.json"
+
+
+def test_append_jsonl_file_appends_non_empty_content(tmp_path: Path):
+    """The content of the extra file is appended to the end of the target."""
+    target = tmp_path / "target.jsonl"
+    extra = tmp_path / "extra.jsonl"
+    target.write_text('{"kind":"A"}\n', encoding="utf-8")
+    extra.write_text('{"kind":"B"}\n', encoding="utf-8")
+
+    append_jsonl_file(target, extra)
+
+    assert target.read_text(encoding="utf-8") == '{"kind":"A"}\n{"kind":"B"}\n'
diff --git a/tests/unit/test_real_world_identifier_layouts.py b/tests/unit/test_real_world_identifier_layouts.py
new file mode 100644
index 0000000..1c27043
--- /dev/null
+++ b/tests/unit/test_real_world_identifier_layouts.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""
+Non-regression tests for identifier layouts seen in real documents.
+"""
+from anonymizer_core_refactored_onnx import (
+    RE_SCAN_FILENAME_ARTIFACT,
+    anonymise_document_regex,
+    load_dictionaries,
+)
+
+
+def test_bacterio_multiline_venue_number_before_ipp_is_masked():
+    """A venue number placed before the ``IPP :`` label is masked as NDA."""
+    cfg = load_dictionaries(None)
+    text = (
+        "Diffusé le :\n"
+        "à\n"
+        "N° venue :\n"
+        "31/07/1973\n"
+        "VAN DE GRAAF\n"
+        "23176885\n"
+        "IPP :\n"
+        "2300201230\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "23176885" not in anon.text_out
+    assert "[NDA]" in anon.text_out
+    assert any(h.kind == "NDA" and h.original == "23176885" for h in anon.audit)
+
+
+def test_scan_filename_artifact_suffix_is_masked():
+    """The trailing number of a scan filename artefact is masked as [DOSSIER]."""
+    cfg = load_dictionaries(None)
+    text = (
+        "IPP:\n"
+        "16014215\n"
+        "Document scanné non\n"
+        "éditable pour patient (dont\ngénétique)\n"
+        "EXT2-16014215-2300249096.TIF\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert RE_SCAN_FILENAME_ARTIFACT.search("EXT2-[IPP]-2300249096.TIF") is not None
+    assert "2300249096" not in anon.text_out
+    assert "EXT2-[IPP]-[DOSSIER].TIF" in anon.text_out