test: non-régression F5 + batch paths + masquage manuel + layouts réels

- test_f5_nom_compose_orphelin.py : 13 tests (regex F5, application, scénario Trackare EJNAINI)
- test_gui_batch_paths.py / test_manual_masking.py : couverture des modules
- test_real_world_identifier_layouts.py : non-régression layouts réels (D-15)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 16:30:56 +02:00
parent 5f8825a0d9
commit f2375d6be2
4 changed files with 379 additions and 0 deletions
--- a/tests/unit/test_f5_nom_compose_orphelin.py
+++ b/tests/unit/test_f5_nom_compose_orphelin.py
@@ -0,0 +1,203 @@
 #!/usr/bin/env python3
 """
 Test de non-regression pour le fix F5 (commit 299bbee).
 F5 : post-passe masquant la continuation orpheline d'un nom compose coupe
 par un saut de ligne dans le format Trackare en colonnes.
 Cas reproduit :
    ... 07:55 NOCENT-
    EJNAINI
 Le nom "NOCENT-EJNAINI" est eclate sur deux lignes. Le NER ligne par ligne
 ne peut pas les assembler. Le 1er composant (NOCENT-) est masque via un
 autre artefact de remplacement, mais le 2e (EJNAINI) reste orphelin en clair.
 F5 ajoute une regex post-masquage qui detecte "[NOM]-\\n<TOKEN_MAJUSCULE>"
 et masque le token orphelin. Le token doit etre directement apres le saut
 de ligne (whitespace accepte), pas apres un autre texte.
 Source : anonymizer_core_refactored_onnx.py, lignes ~4505-4516,
 fonction process_pdf(), bloc "3a-bis) Nettoyage post-masquage".
 """
 from __future__ import annotations
 import re
 import pytest
 from anonymizer_core_refactored_onnx import PLACEHOLDERS
 # ---------------------------------------------------------------------------
 # F5 regex - reproduced here for unit testing (stated to be identical to the
 # one used in process_pdf; keep both copies in sync).
 #   group 1: the literal "[NOM]-" followed by optional whitespace, an
 #            optional newline, then optional whitespace again
 #   group 2: a token of at least 4 characters whose first character is in
 #            A-Z or U+00C0..U+0178, continued by characters from the same
 #            ranges plus apostrophe and hyphen, ending on a word boundary
 # NOTE(review): the newline is optional (`\n?`), so "[NOM]-TOKEN" on a single
 # line also matches; and the U+00C0..U+0178 range also contains lowercase
 # accented letters (e.g. U+00E0) - confirm both are intended.
 # ---------------------------------------------------------------------------
 _RE_NOM_ORPHAN = re.compile(
    r"(\[NOM\]-\s*\n?\s*)([A-Z\u00C0-\u0178][A-Z\u00C0-\u0178'\-]{3,})\b"
 )
 def _apply_f5_nom_orphan(text: str) -> tuple[str, list]:
    """Run the F5 post-pass that masks an orphaned compound-name continuation.

    Every match of ``_RE_NOM_ORPHAN`` has its second group (the orphan token)
    replaced by ``PLACEHOLDERS["NOM"]``, unless the lowercased token is one of
    the medical stop-words below, in which case the match is left untouched.

    According to the original author's note, this mirrors step 3a-bis of
    process_pdf().

    Returns the rewritten text and the list of masked tokens in match order
    (kept for audit purposes).
    """
    # Medical vocabulary that must never be masked as a name (the original
    # note says this is the same list as in process_pdf).
    medical_stop_words = frozenset((
        "ampoule", "ampoules", "comprime", "comprimes", "gelule", "gelules",
        "solution", "solutions", "traitement", "traitements", "injection",
        "perfusions", "prescription", "posologie", "diagnostic", "examen",
        "resultat", "resultats", "observation", "antibiogramme", "bacterio",
    ))
    masked_tokens = []

    def _replace(match):
        token = match.group(2)
        if token.lower() not in medical_stop_words:
            masked_tokens.append(token)
            return match.group(1) + PLACEHOLDERS["NOM"]
        # Stop-word: give the matched text back unchanged.
        return match.group(0)

    rewritten = _RE_NOM_ORPHAN.sub(_replace, text)
    return rewritten, masked_tokens
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 class TestF5NomComposeOrphelin:
    """F5 - orphaned continuation of a compound name split by a line break.

    The tests exercise the module-level ``_RE_NOM_ORPHAN`` pattern and the
    ``_apply_f5_nom_orphan`` helper defined in this file.
    """

    # -- Regex only --

    def test_f5_regex_matches_nom_orphan_direct_after_dash_newline(self):
        """The F5 regex captures an uppercase token right after ``[NOM]-\\n``."""
        text = "[NOM]-\nEJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(1) == "[NOM]-\n"
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_leading_spaces_on_next_line(self):
        """The F5 regex tolerates spaces at the start of the next line."""
        text = "[NOM]-\n  EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_trailing_spaces_before_newline(self):
        """The F5 regex tolerates spaces before the line break."""
        text = "[NOM]-  \n  EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_no_match_when_intervening_text(self):
        """The F5 regex does NOT match when text separates ``[NOM]-\\n`` from the token.

        This is the case when the token is not a direct continuation of the
        compound name (e.g. another column of the Trackare table).
        """
        text = "[NOM]-\nAmpoule(s) EJNAINI"
        # "Ampoule(s)" is not all uppercase, so it cannot be captured as
        # group 2, and EJNAINI does not directly follow \n\s*.
        match = _RE_NOM_ORPHAN.search(text)
        assert match is None, (
            "F5 ne doit pas matcher quand du texte separe [NOM]- du token orphelin"
        )

    def test_f5_regex_rejects_lowercase_start(self):
        """A token starting with an (ASCII) lowercase letter is not captured."""
        match = _RE_NOM_ORPHAN.search("[NOM]-\nejnaini")
        assert match is None

    def test_f5_regex_minimum_length_4_chars(self):
        """The token must be at least 4 characters long (1 + ``{3,}``)."""
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABC") is None, "3 chars = trop court"
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABCD") is not None, "4 chars = OK"

    # -- Applying F5 --

    def test_f5_apply_masks_orphan_token(self):
        """``_apply_f5_nom_orphan`` replaces the orphan token with the NOM placeholder."""
        text = "[NOM]-\nEJNAINI"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "[NOM]-" in cleaned
        assert "EJNAINI" not in cleaned
        # Both halves of the compound name must be masked.
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2

    def test_f5_apply_preserves_context_around_orphan(self):
        """The text surrounding the orphan name is left unchanged."""
        text = "07:55 [NOM]-\nEJNAINI\nSuite du traitement"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "07:55 " in cleaned
        assert "Suite du traitement" in cleaned
        assert "EJNAINI" not in cleaned

    def test_f5_apply_multiple_orphans(self):
        """F5 masks several orphans within the same text."""
        text = "[NOM]-\nDUPONT\nAutre [NOM]-\nMARTIN"
        cleaned, hits = _apply_f5_nom_orphan(text)
        # Pin both the tokens and their order, not just how many were found.
        assert hits == ["DUPONT", "MARTIN"]
        assert "DUPONT" not in cleaned
        assert "MARTIN" not in cleaned
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 4  # 2 initial + 2 orphans

    def test_f5_no_false_positive_on_normal_text(self):
        """F5 leaves a text without the ``[NOM]-\\n<TOKEN>`` pattern untouched."""
        text = "Patient presente le [DATE]. Traitement prescrit."
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == []
        assert cleaned == text

    def test_f5_apply_skips_medical_stop_words(self):
        """An uppercase medical stop-word after ``[NOM]-\\n`` is not masked."""
        text = "[NOM]-\nAMPOULES"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == []
        assert cleaned == text

    # -- Real Trackare case --

    def test_f5_full_trackare_scenario(self):
        """Full Trackare case: the name NOCENT-EJNAINI split by a line break
        in the column-based PDF extraction.

        Trackare column layout:
          Name column : "07:55 NOCENT-"
          Next line   : "EJNAINI"
        After the initial masking (pre-F5):
          "07:55 [NOM]-\\nEJNAINI"
        After F5:
          "07:55 [NOM]-\\n[NOM]"
        """
        # Input simulating the pre-F5 result (NOCENT masked, EJNAINI orphaned).
        pre_f5 = "07:55 [NOM]-\nEJNAINI"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        # Both components of the compound name must end up masked.
        assert "[NOM]-" in cleaned, "Le 1er composant doit rester masque"
        assert "EJNAINI" not in cleaned, "Le 2e composant orphelin doit etre masque par F5"
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2, (
            "Les deux parties du nom compose doivent etre masquees"
        )
        # Exact post-F5 output described in the docstring above.
        assert cleaned == "07:55 [NOM]-\n" + PLACEHOLDERS["NOM"]
        assert hits == ["EJNAINI"], "EJNAINI doit etre loggue dans l'audit"

    def test_f5_trackare_with_spaces_in_column_alignment(self):
        """Trackare case with column-alignment spaces."""
        pre_f5 = "07:55  [NOM]-  \n  EJNAINI  \nSuite"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        assert hits == ["EJNAINI"]
        assert "EJNAINI" not in cleaned
        assert "Suite" in cleaned

    def test_f5_nom_compose_with_apostrophe_and_dash(self):
        """Orphan tokens containing a hyphen or an apostrophe are masked whole."""
        pre_f5 = "[NOM]-\nDUPONT-MARTIN"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        assert hits == ["DUPONT-MARTIN"]
        assert "DUPONT-MARTIN" not in cleaned
        # Apostrophe variant: the token class also accepts "'".
        cleaned, hits = _apply_f5_nom_orphan("[NOM]-\nD'ALEMBERT")
        assert hits == ["D'ALEMBERT"]
        assert "D'ALEMBERT" not in cleaned
 if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly ends
    # with a non-zero status when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
--- a/tests/unit/test_gui_batch_paths.py
+++ b/tests/unit/test_gui_batch_paths.py
@@ -0,0 +1,70 @@
 from pathlib import Path
 from gui_batch_paths import (
    build_batch_output_dir,
    iter_pseudonymized_texts,
    list_supported_documents,
 )
 def test_list_supported_documents_excludes_gui_output_tree(tmp_path: Path):
    """Files below the ``anonymise`` output tree are not listed as sources."""
    root = tmp_path / "cases"
    source_txt = root / "001_patient_header_and_birth" / "test.txt"
    source_pdf = root / "002_contact_bundle" / "source.pdf"
    output_txt = (
        root / "anonymise" / "001_patient_header_and_birth" / "test.pseudonymise.txt"
    )
    fixture_files = (
        (source_txt, "source"),
        (source_pdf, "pdf"),
        (output_txt, "output"),
    )
    # Create every directory first, then the files, in declaration order.
    for path, _ in fixture_files:
        path.parent.mkdir(parents=True)
    for path, content in fixture_files:
        path.write_text(content, encoding="utf-8")

    listed = list_supported_documents(root, {".txt", ".pdf"})

    assert listed == [source_txt, source_pdf]
 def test_build_batch_output_dir_preserves_relative_parent(tmp_path: Path):
    """A source inside a case folder maps to the same folder name under the output root."""
    root = tmp_path / "cases"
    case_dir = root / "010_spaced_establishment_header"
    case_dir.mkdir(parents=True)
    source = case_dir / "test.txt"
    source.write_text("test", encoding="utf-8")
    output_root = root / "anonymise"

    resolved = build_batch_output_dir(root, output_root, source)

    assert resolved == output_root / "010_spaced_establishment_header"
 def test_build_batch_output_dir_keeps_root_files_at_output_root(tmp_path: Path):
    """A source sitting directly in the root maps to the output root itself."""
    root = tmp_path / "cases"
    root.mkdir(parents=True)
    source = root / "test.txt"
    source.write_text("test", encoding="utf-8")
    output_root = root / "anonymise"

    resolved = build_batch_output_dir(root, output_root, source)

    assert resolved == output_root
 def test_iter_pseudonymized_texts_is_recursive(tmp_path: Path):
    """Pseudonymised texts are found at the top level and in sub-folders.

    The ``audit.jsonl`` file stored next to them is expected to be left out.
    """
    output_root = tmp_path / "anonymise"
    case_dir = output_root / "001_patient_header_and_birth"
    case_dir.mkdir(parents=True)

    top_level = output_root / "summary.pseudonymise.txt"
    nested_txt = case_dir / "test.pseudonymise.txt"
    audit_log = case_dir / "audit.jsonl"
    for path, content in ((top_level, "top"), (nested_txt, "nested"), (audit_log, "{}")):
        path.write_text(content, encoding="utf-8")

    # Sort the results so the comparison does not depend on traversal order.
    discovered = sorted(iter_pseudonymized_texts(output_root))

    assert discovered == [nested_txt, top_level]
--- a/tests/unit/test_manual_masking.py
+++ b/tests/unit/test_manual_masking.py
@@ -0,0 +1,60 @@
 from pathlib import Path
 from manual_masking import (
    DEFAULT_MASK_OUTPUT_DIRNAME,
    DEFAULT_MASK_PREVIEW_DIRNAME,
    append_jsonl_file,
    ensure_mask_templates_dir,
    list_mask_templates,
    mask_templates_dir,
    mask_template_label,
    resolve_manual_mask_pdf,
 )
 def test_mask_templates_dir_is_under_config():
    """The templates directory resolves to ``<base>/config/mask_templates``."""
    base = Path("/tmp/anonymisation")
    expected = base / "config" / "mask_templates"
    resolved = mask_templates_dir(base)
    assert resolved == expected
 def test_ensure_mask_templates_dir_creates_folder(tmp_path: Path):
    """The returned templates path is the expected one and exists as a directory."""
    expected = tmp_path / "config" / "mask_templates"
    templates_dir = ensure_mask_templates_dir(tmp_path)
    assert templates_dir == expected
    assert templates_dir.is_dir()
 def test_resolve_manual_mask_pdf_accepts_only_pdf():
    """PDF paths (either extension case) are returned as-is; anything else gives None."""
    for pdf_path in (Path("/tmp/test.pdf"), Path("/tmp/test.PDF")):
        assert resolve_manual_mask_pdf(pdf_path) == pdf_path
    for rejected in (Path("/tmp/test.docx"), None):
        assert resolve_manual_mask_pdf(rejected) is None
 def test_manual_mask_outputs_follow_project_convention():
    """Pin the default output and preview directory names."""
    actual_names = (DEFAULT_MASK_OUTPUT_DIRNAME, DEFAULT_MASK_PREVIEW_DIRNAME)
    assert actual_names == ("anonymise", "anonymise_preview")
 def test_list_mask_templates_filters_supported_extensions(tmp_path: Path):
    """Only the ``.yml`` and nested ``.json`` fixtures are listed; ``.txt`` is not.

    Also checks that the label of a nested template is its path relative to
    the templates directory.
    """
    templates_dir = ensure_mask_templates_dir(tmp_path)
    kept = templates_dir / "alpha.yml"
    ignored = templates_dir / "beta.txt"
    nested = templates_dir / "nested" / "gamma.json"
    nested.parent.mkdir(parents=True)
    for template in (kept, ignored, nested):
        template.write_text("x", encoding="utf-8")

    listed = list_mask_templates(tmp_path)
    label = mask_template_label(nested, tmp_path)

    assert listed == [kept, nested]
    assert label == "nested/gamma.json"
 def test_append_jsonl_file_appends_non_empty_content(tmp_path: Path):
    """The content of the extra file ends up after the target's existing content."""
    existing_line = '{"kind":"A"}\n'
    appended_line = '{"kind":"B"}\n'
    target = tmp_path / "target.jsonl"
    extra = tmp_path / "extra.jsonl"
    target.write_text(existing_line, encoding="utf-8")
    extra.write_text(appended_line, encoding="utf-8")

    append_jsonl_file(target, extra)

    merged = target.read_text(encoding="utf-8")
    assert merged == existing_line + appended_line
--- a/tests/unit/test_real_world_identifier_layouts.py
+++ b/tests/unit/test_real_world_identifier_layouts.py
@@ -0,0 +1,46 @@
 #!/usr/bin/env python3
 """
 Tests de non-régression sur des layouts d'identifiants vus en documents réels.
 """
 from anonymizer_core_refactored_onnx import (
    RE_SCAN_FILENAME_ARTIFACT,
    anonymise_document_regex,
    load_dictionaries,
 )
 def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """A venue number on its own line before the ``IPP :`` label is masked as NDA.

    The number must disappear from the output text, the ``[NDA]`` placeholder
    must be present, and the audit must hold an NDA entry for the original
    value.
    """
    venue_number = "23176885"
    document = (
        "Diffusé le :\n"
        "à\n"
        "N° venue :\n"
        "31/07/1973\n"
        "VAN DE GRAAF\n"
        "23176885\n"
        "IPP :\n"
        "2300201230\n"
    )
    cfg = load_dictionaries(None)

    result = anonymise_document_regex([document], [[]], cfg)

    assert venue_number not in result.text_out
    assert "[NDA]" in result.text_out
    assert any(
        entry.kind == "NDA" and entry.original == venue_number
        for entry in result.audit
    )
 def test_scan_filename_artifact_suffix_is_masked():
    """The numeric suffix of a scan filename artifact is masked as ``[DOSSIER]``.

    Also checks that ``RE_SCAN_FILENAME_ARTIFACT`` matches the filename once
    its IPP part has already been replaced by ``[IPP]``.
    """
    scanned_page = (
        "IPP:\n"
        "16014215\n"
        "Document scanné non\n"
        "éditable pour patient (dont\ngénétique)\n"
        "EXT2-16014215-2300249096.TIF\n"
    )
    cfg = load_dictionaries(None)

    result = anonymise_document_regex([scanned_page], [[]], cfg)

    partially_masked_name = "EXT2-[IPP]-2300249096.TIF"
    assert RE_SCAN_FILENAME_ARTIFACT.search(partially_masked_name) is not None
    assert "2300249096" not in result.text_out
    assert "EXT2-[IPP]-[DOSSIER].TIF" in result.text_out