test: non-régression F5 + batch paths + masquage manuel + layouts réels

- test_f5_nom_compose_orphelin.py : 13 tests (regex F5, application, scénario Trackare EJNAINI)
- test_gui_batch_paths.py / test_manual_masking.py : couverture des modules
- test_real_world_identifier_layouts.py : non-régression layouts réels (D-15)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-04 16:30:56 +02:00
parent 5f8825a0d9
commit f2375d6be2
4 changed files with 379 additions and 0 deletions

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""
Test de non-regression pour le fix F5 (commit 299bbee).
F5 : post-passe masquant la continuation orpheline d'un nom compose coupe
par un saut de ligne dans le format Trackare en colonnes.
Cas reproduit :
... 07:55 NOCENT-
EJNAINI
Le nom "NOCENT-EJNAINI" est eclate sur deux lignes. Le NER ligne par ligne
ne peut pas les assembler. Le 1er composant (NOCENT-) est masque via un
autre artefact de remplacement, mais le 2e (EJNAINI) reste orphelin en clair.
F5 ajoute une regex post-masquage qui detecte "[NOM]-\\n<TOKEN_MAJUSCULE>"
et masque le token orphelin. Le token doit etre directement apres le saut
de ligne (whitespace accepte), pas apres un autre texte.
Source : anonymizer_core_refactored_onnx.py, lignes ~4505-4516,
fonction process_pdf(), bloc "3a-bis) Nettoyage post-masquage".
"""
from __future__ import annotations
import re
import pytest
from anonymizer_core_refactored_onnx import PLACEHOLDERS
# ---------------------------------------------------------------------------
# F5 regex -- reproduced here for unit testing (per the original note, it is
# meant to be identical to the one used in process_pdf).
#
# Group 1: the literal "[NOM]-" followed by optional whitespace, an optional
#          newline, and more optional whitespace.
# Group 2: the orphan token: one character in A-Z or U+00C0..U+0178, then at
#          least three more characters from the same ranges plus "'" and "-"
#          (so 4 characters minimum), followed by a word boundary.
#
# NOTE(review): \s also matches "\n", so several consecutive line breaks
# between "[NOM]-" and the token are accepted, and the U+00C0..U+0178 range
# also contains lowercase accented letters -- confirm both are intended.
# ---------------------------------------------------------------------------
_RE_NOM_ORPHAN = re.compile(
r"(\[NOM\]-\s*\n?\s*)([A-Z\u00C0-\u0178][A-Z\u00C0-\u0178'\-]{3,})\b"
)
def _apply_f5_nom_orphan(text: str) -> tuple[str, list]:
    """Run the F5 post-pass that masks an orphan compound-name continuation.

    Every ``_RE_NOM_ORPHAN`` match whose token (group 2) is not a medical
    stop-word is rewritten as group 1 followed by ``PLACEHOLDERS["NOM"]``.
    Matches whose lowercased token is a stop-word are left untouched.

    Per the original note, this replicates step 3a-bis of ``process_pdf()``.

    Args:
        text: Text that already went through the initial masking.

    Returns:
        A ``(cleaned_text, masked_tokens)`` pair, where ``masked_tokens``
        lists the replaced tokens in match order (for audit purposes).
    """
    # Medical stop-words that must never be masked (list copied from
    # process_pdf according to the original comment).
    medical_stop_words = {
        "ampoule", "ampoules", "comprime", "comprimes", "gelule", "gelules",
        "solution", "solutions", "traitement", "traitements", "injection",
        "perfusions", "prescription", "posologie", "diagnostic", "examen",
        "resultat", "resultats", "observation", "antibiogramme", "bacterio",
    }
    masked_tokens = []

    def _mask_orphan(match):
        prefix, token = match.group(1), match.group(2)
        if token.lower() not in medical_stop_words:
            masked_tokens.append(token)
            return prefix + PLACEHOLDERS["NOM"]
        # Stop-word: give the matched text back unchanged.
        return match.group(0)

    return _RE_NOM_ORPHAN.sub(_mask_orphan, text), masked_tokens
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestF5NomComposeOrphelin:
    """F5 - orphan continuation of a compound name split by a line break."""

    # -- Regex only --

    def test_f5_regex_matches_nom_orphan_direct_after_dash_newline(self):
        """The F5 regex captures an uppercase token right after "[NOM]-\\n"."""
        text = "[NOM]-\nEJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(1) == "[NOM]-\n"
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_leading_spaces_on_next_line(self):
        """The F5 regex tolerates spaces at the start of the next line."""
        text = "[NOM]-\n EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_trailing_spaces_before_newline(self):
        """The F5 regex tolerates spaces before the line break."""
        text = "[NOM]- \n EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_no_match_when_intervening_text(self):
        """The F5 regex does NOT match when text separates "[NOM]-\\n" and the token.

        This happens when the token is not a direct continuation of the
        compound name (e.g. another column of the Trackare table).
        """
        text = "[NOM]-\nAmpoule(s) EJNAINI"
        # "Ampoule(s)" is not all uppercase, so it cannot be group 2, and
        # EJNAINI does not directly follow the "\n\s*" part of the pattern.
        match = _RE_NOM_ORPHAN.search(text)
        assert match is None, (
            "F5 ne doit pas matcher quand du texte separe [NOM]- du token orphelin"
        )

    def test_f5_regex_rejects_lowercase_start(self):
        """A token starting with a lowercase letter is not captured."""
        match = _RE_NOM_ORPHAN.search("[NOM]-\nejnaini")
        assert match is None

    def test_f5_regex_minimum_length_4_chars(self):
        """The token must be at least 4 characters long (1 + {3,})."""
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABC") is None, "3 chars = trop court"
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABCD") is not None, "4 chars = OK"

    # -- F5 application --

    def test_f5_apply_masks_orphan_token(self):
        """_apply_f5_nom_orphan replaces the orphan token with the NOM placeholder."""
        text = "[NOM]-\nEJNAINI"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "[NOM]-" in cleaned
        assert "EJNAINI" not in cleaned
        # Both halves of the compound name must now be masked.
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2

    def test_f5_apply_preserves_context_around_orphan(self):
        """The text surrounding the orphan name is left unchanged."""
        text = "07:55 [NOM]-\nEJNAINI\nSuite du traitement"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "07:55 " in cleaned
        assert "Suite du traitement" in cleaned
        assert "EJNAINI" not in cleaned

    def test_f5_apply_multiple_orphans(self):
        """F5 masks several orphan tokens within the same text."""
        text = "[NOM]-\nDUPONT\nAutre [NOM]-\nMARTIN"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert len(hits) == 2
        assert "DUPONT" not in cleaned
        assert "MARTIN" not in cleaned
        # 2 placeholders already present + 2 masked orphans.
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 4

    def test_f5_no_false_positive_on_normal_text(self):
        """F5 leaves a text without the "[NOM]-\\n<TOKEN>" pattern untouched."""
        text = "Patient presente le [DATE]. Traitement prescrit."
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == []
        assert cleaned == text

    # -- Real Trackare case --

    def test_f5_full_trackare_scenario(self):
        """Full Trackare case: the name NOCENT-EJNAINI split by a line break.

        Trackare column layout:
            name column : "07:55 NOCENT-"
            next line   : "EJNAINI"
        After the initial masking (pre-F5):
            "07:55 [NOM]-\\nEJNAINI"
        After F5:
            "07:55 [NOM]-\\n[NOM]"
        """
        # Input simulating the pre-F5 result (NOCENT masked, EJNAINI orphan).
        pre_f5 = "07:55 [NOM]-\nEJNAINI"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        # Both components of the compound name must end up masked.
        assert "[NOM]-" in cleaned, "Le 1er composant doit rester masque"
        assert "EJNAINI" not in cleaned, "Le 2e composant orphelin doit etre masque par F5"
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2, (
            "Les deux parties du nom compose doivent etre masquees"
        )
        assert hits == ["EJNAINI"], "EJNAINI doit etre loggue dans l'audit"

    def test_f5_trackare_with_spaces_in_column_alignment(self):
        """Trackare case with column-alignment spaces around the token."""
        pre_f5 = "07:55 [NOM]- \n EJNAINI \nSuite"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        assert hits == ["EJNAINI"]
        assert "EJNAINI" not in cleaned
        assert "Suite" in cleaned

    def test_f5_nom_compose_with_apostrophe_and_dash(self):
        """Orphan tokens containing a dash or an apostrophe are masked whole."""
        # The apostrophe case makes the test cover what its name announces.
        for token in ("DUPONT-MARTIN", "D'ALEMBERT"):
            pre_f5 = "[NOM]-\n" + token
            cleaned, hits = _apply_f5_nom_orphan(pre_f5)
            assert hits == [token]
            assert token not in cleaned
if __name__ == "__main__":
    # pytest.main() returns the session exit code; propagate it so that
    # running this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))

View File

@@ -0,0 +1,70 @@
from pathlib import Path
from gui_batch_paths import (
build_batch_output_dir,
iter_pseudonymized_texts,
list_supported_documents,
)
def test_list_supported_documents_excludes_gui_output_tree(tmp_path: Path):
    """Expect files under the ``anonymise`` sub-tree to be left out of the listing.

    Two source case folders and one ``anonymise`` output folder are created
    under the same root; only the two source documents must come back, in
    that order.
    """
    root = tmp_path / "cases"
    source_txt = root / "001_patient_header_and_birth" / "test.txt"
    source_pdf = root / "002_contact_bundle" / "source.pdf"
    output_txt = (
        root / "anonymise" / "001_patient_header_and_birth" / "test.pseudonymise.txt"
    )
    fixtures = (
        (source_txt, "source"),
        (source_pdf, "pdf"),
        (output_txt, "output"),
    )
    for path, content in fixtures:
        path.parent.mkdir(parents=True)
        path.write_text(content, encoding="utf-8")

    documents = list_supported_documents(root, {".txt", ".pdf"})

    assert documents == [source_txt, source_pdf]
def test_build_batch_output_dir_preserves_relative_parent(tmp_path: Path):
    """Expect the source's folder name to be reproduced under the output root."""
    case_name = "010_spaced_establishment_header"
    root = tmp_path / "cases"
    output_root = root / "anonymise"
    case_dir = root / case_name
    case_dir.mkdir(parents=True)
    source = case_dir / "test.txt"
    source.write_text("test", encoding="utf-8")

    assert build_batch_output_dir(root, output_root, source) == output_root / case_name
def test_build_batch_output_dir_keeps_root_files_at_output_root(tmp_path: Path):
    """Expect a file sitting directly in the root to map to the output root itself."""
    root = tmp_path / "cases"
    root.mkdir(parents=True)
    source = root / "test.txt"
    source.write_text("test", encoding="utf-8")
    output_root = root / "anonymise"

    assert build_batch_output_dir(root, output_root, source) == output_root
def test_iter_pseudonymized_texts_is_recursive(tmp_path: Path):
    """Expect ``*.pseudonymise.txt`` files to be found at every depth.

    The ``audit.jsonl`` file placed next to the nested text must not be
    part of the result.
    """
    output_root = tmp_path / "anonymise"
    nested_dir = output_root / "001_patient_header_and_birth"
    nested_dir.mkdir(parents=True)

    top_level = output_root / "summary.pseudonymise.txt"
    nested_txt = nested_dir / "test.pseudonymise.txt"
    for path, content in (
        (top_level, "top"),
        (nested_txt, "nested"),
        (nested_dir / "audit.jsonl", "{}"),
    ):
        path.write_text(content, encoding="utf-8")

    # Sorted so the comparison does not depend on traversal order.
    assert sorted(iter_pseudonymized_texts(output_root)) == [nested_txt, top_level]

View File

@@ -0,0 +1,60 @@
from pathlib import Path
from manual_masking import (
DEFAULT_MASK_OUTPUT_DIRNAME,
DEFAULT_MASK_PREVIEW_DIRNAME,
append_jsonl_file,
ensure_mask_templates_dir,
list_mask_templates,
mask_templates_dir,
mask_template_label,
resolve_manual_mask_pdf,
)
def test_mask_templates_dir_is_under_config():
    """Expect the templates folder to be ``<base>/config/mask_templates``."""
    project_root = Path("/tmp/anonymisation")
    expected = project_root / "config" / "mask_templates"
    assert mask_templates_dir(project_root) == expected
def test_ensure_mask_templates_dir_creates_folder(tmp_path: Path):
    """Expect the returned templates path to exist as a directory afterwards."""
    templates_dir = ensure_mask_templates_dir(tmp_path)
    expected = tmp_path / "config" / "mask_templates"
    assert templates_dir == expected
    assert templates_dir.is_dir()
def test_resolve_manual_mask_pdf_accepts_only_pdf():
    """Expect PDF paths (any extension case) back as-is, and ``None`` otherwise."""
    for accepted in (Path("/tmp/test.pdf"), Path("/tmp/test.PDF")):
        assert resolve_manual_mask_pdf(accepted) == accepted
    for rejected in (Path("/tmp/test.docx"), None):
        assert resolve_manual_mask_pdf(rejected) is None
def test_manual_mask_outputs_follow_project_convention():
    """Pin the default output and preview directory names."""
    actual = (DEFAULT_MASK_OUTPUT_DIRNAME, DEFAULT_MASK_PREVIEW_DIRNAME)
    assert actual == ("anonymise", "anonymise_preview")
def test_list_mask_templates_filters_supported_extensions(tmp_path: Path):
    """Expect ``.yml`` and nested ``.json`` templates to be listed, ``.txt`` ignored.

    Also checks that the label of a nested template is its path relative to
    the templates folder, written with forward slashes.
    """
    templates_dir = ensure_mask_templates_dir(tmp_path)
    nested_dir = templates_dir / "nested"
    nested_dir.mkdir(parents=True)

    kept = templates_dir / "alpha.yml"
    nested = nested_dir / "gamma.json"
    for template in (kept, templates_dir / "beta.txt", nested):
        template.write_text("x", encoding="utf-8")

    assert list_mask_templates(tmp_path) == [kept, nested]
    assert mask_template_label(nested, tmp_path) == "nested/gamma.json"
def test_append_jsonl_file_appends_non_empty_content(tmp_path: Path):
    """Expect the extra file's line to be appended after the target's content."""
    first_line = '{"kind":"A"}\n'
    second_line = '{"kind":"B"}\n'
    target = tmp_path / "target.jsonl"
    extra = tmp_path / "extra.jsonl"
    target.write_text(first_line, encoding="utf-8")
    extra.write_text(second_line, encoding="utf-8")

    append_jsonl_file(target, extra)

    merged = target.read_text(encoding="utf-8")
    assert merged == first_line + second_line

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
Tests de non-régression sur des layouts d'identifiants vus en documents réels.
"""
from anonymizer_core_refactored_onnx import (
RE_SCAN_FILENAME_ARTIFACT,
anonymise_document_regex,
load_dictionaries,
)
def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """Expect the number printed just before ``IPP :`` to be masked as ``[NDA]``.

    The layout puts the "N° venue :" label several lines above its value,
    with a date and a name in between.
    """
    venue_number = "23176885"
    text = "".join(
        [
            "Diffusé le :\n",
            "à\n",
            "N° venue :\n",
            "31/07/1973\n",
            "VAN DE GRAAF\n",
            "23176885\n",
            "IPP :\n",
            "2300201230\n",
        ]
    )
    cfg = load_dictionaries(None)

    anon = anonymise_document_regex([text], [[]], cfg)

    assert venue_number not in anon.text_out
    assert "[NDA]" in anon.text_out
    # The replacement must also be recorded in the audit trail.
    assert any(
        hit.kind == "NDA" and hit.original == venue_number for hit in anon.audit
    )
def test_scan_filename_artifact_suffix_is_masked():
    """Expect both numbers of a scanned-file name to end up masked.

    The first number of ``EXT2-16014215-2300249096.TIF`` is the IPP given
    above it; the trailing number must come out as ``[DOSSIER]``.
    """
    text = "".join(
        [
            "IPP:\n",
            "16014215\n",
            "Document scanné non\n",
            "éditable pour patient (dont\ngénétique)\n",
            "EXT2-16014215-2300249096.TIF\n",
        ]
    )
    cfg = load_dictionaries(None)

    anon = anonymise_document_regex([text], [[]], cfg)
    masked_output = anon.text_out

    # The artifact regex must recognise the file name once its IPP is masked.
    assert RE_SCAN_FILENAME_ARTIFACT.search("EXT2-[IPP]-2300249096.TIF") is not None
    assert "2300249096" not in masked_output
    assert "EXT2-[IPP]-[DOSSIER].TIF" in masked_output