test: non-régression F5 + batch paths + masquage manuel + layouts réels
- test_f5_nom_compose_orphelin.py : 13 tests (regex F5, application, scénario Trackare EJNAINI) - test_gui_batch_paths.py / test_manual_masking.py : couverture des modules - test_real_world_identifier_layouts.py : non-régression layouts réels (D-15) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
203
tests/unit/test_f5_nom_compose_orphelin.py
Normal file
203
tests/unit/test_f5_nom_compose_orphelin.py
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test de non-regression pour le fix F5 (commit 299bbee).
|
||||||
|
|
||||||
|
F5 : post-passe masquant la continuation orpheline d'un nom compose coupe
|
||||||
|
par un saut de ligne dans le format Trackare en colonnes.
|
||||||
|
|
||||||
|
Cas reproduit :
|
||||||
|
... 07:55 NOCENT-
|
||||||
|
EJNAINI
|
||||||
|
|
||||||
|
Le nom "NOCENT-EJNAINI" est eclate sur deux lignes. Le NER ligne par ligne
|
||||||
|
ne peut pas les assembler. Le 1er composant (NOCENT-) est masque via un
|
||||||
|
autre artefact de remplacement, mais le 2e (EJNAINI) reste orphelin en clair.
|
||||||
|
|
||||||
|
F5 ajoute une regex post-masquage qui detecte "[NOM]-\\n<TOKEN_MAJUSCULE>"
|
||||||
|
et masque le token orphelin. Le token doit etre directement apres le saut
|
||||||
|
de ligne (whitespace accepte), pas apres un autre texte.
|
||||||
|
|
||||||
|
Source : anonymizer_core_refactored_onnx.py, lignes ~4505-4516,
|
||||||
|
fonction process_pdf(), bloc "3a-bis) Nettoyage post-masquage".
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from anonymizer_core_refactored_onnx import PLACEHOLDERS
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# F5 regex — reproduite ici pour test unitaire (identique a process_pdf)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-masking pattern for an orphaned compound-surname continuation.
#   group 1: the already-masked first half "[NOM]-", followed by optional
#            whitespace and an optional line break before the continuation
#   group 2: the continuation token: one leading letter plus at least three
#            more characters (letters, apostrophe or hyphen), i.e. 4+ chars
# NOTE: the \u00C0-\u0178 range also spans lowercase accented letters, so the
# "uppercase only" restriction is strict for ASCII letters only.
_RE_NOM_ORPHAN = re.compile(
    r"(\[NOM\]-\s*\n?\s*)"
    r"([A-Z\u00C0-\u0178][A-Z\u00C0-\u0178'\-]{3,})"
    r"\b"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_f5_nom_orphan(text: str) -> tuple[str, list]:
    """Run the F5 post-pass over already-masked text.

    Every match of ``_RE_NOM_ORPHAN`` whose continuation token is not a
    medical stop-word has that token replaced by ``PLACEHOLDERS["NOM"]``;
    the "[NOM]-" prefix and the whitespace captured with it are kept as-is.

    Returns the rewritten text together with the list of tokens that were
    masked, in match order, so the caller can audit them.

    This is a local copy intended to mirror step 3a-bis of process_pdf().
    """
    # Medical vocabulary that must never be treated as a surname continuation
    # (lower-case, unaccented; compared against the lower-cased token).
    medical_stop_words = frozenset((
        "ampoule", "ampoules", "comprime", "comprimes", "gelule", "gelules",
        "solution", "solutions", "traitement", "traitements", "injection",
        "perfusions", "prescription", "posologie", "diagnostic", "examen",
        "resultat", "resultats", "observation", "antibiogramme", "bacterio",
    ))
    masked_tokens = []

    def _mask_continuation(match):
        token = match.group(2)
        if token.lower() not in medical_stop_words:
            masked_tokens.append(token)
            return match.group(1) + PLACEHOLDERS["NOM"]
        # Stop-word: leave the whole match untouched and record nothing.
        return match.group(0)

    rewritten = _RE_NOM_ORPHAN.sub(_mask_continuation, text)
    return rewritten, masked_tokens
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestF5NomComposeOrphelin:
    """F5 - orphaned continuation of a compound surname split by a line break."""

    # -- Regex only --

    def test_f5_regex_matches_nom_orphan_direct_after_dash_newline(self):
        """The F5 regex captures an uppercase token right after "[NOM]-\\n"."""
        text = "[NOM]-\nEJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(1) == "[NOM]-\n"
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_leading_spaces_on_next_line(self):
        """The F5 regex tolerates spaces at the start of the following line."""
        text = "[NOM]-\n EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_matches_with_trailing_spaces_before_newline(self):
        """The F5 regex tolerates spaces before the line break."""
        text = "[NOM]- \n EJNAINI"
        match = _RE_NOM_ORPHAN.search(text)
        assert match is not None
        assert match.group(2) == "EJNAINI"

    def test_f5_regex_no_match_when_intervening_text(self):
        """The F5 regex does NOT match when other text separates "[NOM]-\\n" from the token.

        This happens when the token is not a direct continuation of the
        compound surname (e.g. it belongs to another column of the Trackare
        table).
        """
        text = "[NOM]-\nAmpoule(s) EJNAINI"
        # "Ampoule(s)" is not all-uppercase, so it cannot be group 2, and
        # EJNAINI does not sit directly after the line break / whitespace.
        match = _RE_NOM_ORPHAN.search(text)
        assert match is None, (
            "F5 ne doit pas matcher quand du texte separe [NOM]- du token orphelin"
        )

    def test_f5_regex_rejects_lowercase_start(self):
        """A token starting with a lowercase ASCII letter is not captured."""
        match = _RE_NOM_ORPHAN.search("[NOM]-\nejnaini")
        assert match is None

    def test_f5_regex_minimum_length_4_chars(self):
        """The token must be at least 4 characters long (1 + {3,})."""
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABC") is None, "3 chars = trop court"
        assert _RE_NOM_ORPHAN.search("[NOM]-\nABCD") is not None, "4 chars = OK"

    # -- Applying F5 --

    def test_f5_apply_masks_orphan_token(self):
        """_apply_f5_nom_orphan replaces the orphaned token with the NOM placeholder."""
        text = "[NOM]-\nEJNAINI"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "[NOM]-" in cleaned
        assert "EJNAINI" not in cleaned
        # Both halves of the compound surname must now be masked.
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2

    def test_f5_apply_preserves_context_around_orphan(self):
        """Text surrounding the orphaned surname is left untouched."""
        text = "07:55 [NOM]-\nEJNAINI\nSuite du traitement"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == ["EJNAINI"]
        assert "07:55 " in cleaned
        assert "Suite du traitement" in cleaned
        assert "EJNAINI" not in cleaned

    def test_f5_apply_multiple_orphans(self):
        """F5 masks several orphaned continuations within the same text."""
        text = "[NOM]-\nDUPONT\nAutre [NOM]-\nMARTIN"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert len(hits) == 2
        assert "DUPONT" not in cleaned
        assert "MARTIN" not in cleaned
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 4  # 2 initial + 2 orphans

    def test_f5_no_false_positive_on_normal_text(self):
        """F5 leaves text without a "[NOM]-\\n<TOKEN>" pattern unchanged."""
        text = "Patient presente le [DATE]. Traitement prescrit."
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == []
        assert cleaned == text

    def test_f5_apply_keeps_medical_stop_word(self):
        """An uppercase medical stop-word after "[NOM]-\\n" is neither masked nor audited."""
        text = "[NOM]-\nAMPOULE"
        cleaned, hits = _apply_f5_nom_orphan(text)
        assert hits == []
        assert cleaned == text

    # -- Real Trackare case --

    def test_f5_full_trackare_scenario(self):
        """Full Trackare case: surname NOCENT-EJNAINI split by a line break
        in the column-based PDF extraction.

        Trackare column layout:
          Name column : "07:55 NOCENT-"
          Next line   : "EJNAINI"

        After the initial masking (pre-F5):
          "07:55 [NOM]-\\nEJNAINI"

        After F5:
          "07:55 [NOM]-\\n[NOM]"
        """
        # Input simulating the pre-F5 result (NOCENT masked, EJNAINI orphaned).
        pre_f5 = "07:55 [NOM]-\nEJNAINI"

        cleaned, hits = _apply_f5_nom_orphan(pre_f5)

        # Both components of the compound surname must be masked.
        assert "[NOM]-" in cleaned, "Le 1er composant doit rester masque"
        assert "EJNAINI" not in cleaned, "Le 2e composant orphelin doit etre masque par F5"
        # Pin the exact output: untouched prefix, kept "[NOM]-\n", then the placeholder.
        assert cleaned == "07:55 [NOM]-\n" + PLACEHOLDERS["NOM"], (
            "F5 output must be the masked prefix followed by the NOM placeholder"
        )
        assert cleaned.count(PLACEHOLDERS["NOM"]) == 2, (
            "Les deux parties du nom compose doivent etre masquees"
        )
        assert hits == ["EJNAINI"], "EJNAINI doit etre loggue dans l'audit"

    def test_f5_trackare_with_spaces_in_column_alignment(self):
        """Trackare case with column-alignment spaces around the line break."""
        pre_f5 = "07:55 [NOM]- \n EJNAINI \nSuite"
        cleaned, hits = _apply_f5_nom_orphan(pre_f5)
        assert hits == ["EJNAINI"]
        assert "EJNAINI" not in cleaned
        assert "Suite" in cleaned

    def test_f5_nom_compose_with_apostrophe_and_dash(self):
        """Orphaned tokens containing a hyphen or an apostrophe are masked whole."""
        for token in ("DUPONT-MARTIN", "D'ALMEIDA"):
            pre_f5 = "[NOM]-\n" + token
            cleaned, hits = _apply_f5_nom_orphan(pre_f5)
            assert hits == [token]
            assert token not in cleaned
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Forward pytest's return code as the process exit status so that running
    # this file directly reports test failures instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||||
70
tests/unit/test_gui_batch_paths.py
Normal file
70
tests/unit/test_gui_batch_paths.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from gui_batch_paths import (
|
||||||
|
build_batch_output_dir,
|
||||||
|
iter_pseudonymized_texts,
|
||||||
|
list_supported_documents,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_supported_documents_excludes_gui_output_tree(tmp_path: Path):
    """Only the two source documents are listed; the file under "anonymise" is not."""
    root = tmp_path / "cases"

    # Two source case folders plus one folder inside the "anonymise" output tree.
    first_case = root / "001_patient_header_and_birth"
    second_case = root / "002_contact_bundle"
    generated_case = root / "anonymise" / "001_patient_header_and_birth"
    for folder in (first_case, second_case, generated_case):
        folder.mkdir(parents=True)

    source_txt = first_case / "test.txt"
    source_pdf = second_case / "source.pdf"
    generated_txt = generated_case / "test.pseudonymise.txt"
    for path, content in (
        (source_txt, "source"),
        (source_pdf, "pdf"),
        (generated_txt, "output"),
    ):
        path.write_text(content, encoding="utf-8")

    assert list_supported_documents(root, {".txt", ".pdf"}) == [source_txt, source_pdf]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_batch_output_dir_preserves_relative_parent(tmp_path: Path):
    """A source inside a case sub-folder maps to the same sub-folder under the output root."""
    case_name = "010_spaced_establishment_header"
    root = tmp_path / "cases"
    output_root = root / "anonymise"

    case_dir = root / case_name
    case_dir.mkdir(parents=True)
    source = case_dir / "test.txt"
    source.write_text("test", encoding="utf-8")

    assert build_batch_output_dir(root, output_root, source) == output_root / case_name
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_batch_output_dir_keeps_root_files_at_output_root(tmp_path: Path):
    """A source sitting directly in the root maps to the output root itself."""
    root = tmp_path / "cases"
    root.mkdir(parents=True)

    source = root / "test.txt"
    source.write_text("test", encoding="utf-8")

    output_root = root / "anonymise"
    assert build_batch_output_dir(root, output_root, source) == output_root
|
||||||
|
|
||||||
|
|
||||||
|
def test_iter_pseudonymized_texts_is_recursive(tmp_path: Path):
    """Pseudonymised texts are found at the top level and in sub-folders; the .jsonl file is not returned."""
    output_root = tmp_path / "anonymise"
    case_dir = output_root / "001_patient_header_and_birth"
    case_dir.mkdir(parents=True)

    top_level_txt = output_root / "summary.pseudonymise.txt"
    nested_txt = case_dir / "test.pseudonymise.txt"
    audit_file = case_dir / "audit.jsonl"
    for path, content in (
        (top_level_txt, "top"),
        (nested_txt, "nested"),
        (audit_file, "{}"),
    ):
        path.write_text(content, encoding="utf-8")

    discovered = list(iter_pseudonymized_texts(output_root))
    discovered.sort()

    assert discovered == [nested_txt, top_level_txt]
|
||||||
60
tests/unit/test_manual_masking.py
Normal file
60
tests/unit/test_manual_masking.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from manual_masking import (
|
||||||
|
DEFAULT_MASK_OUTPUT_DIRNAME,
|
||||||
|
DEFAULT_MASK_PREVIEW_DIRNAME,
|
||||||
|
append_jsonl_file,
|
||||||
|
ensure_mask_templates_dir,
|
||||||
|
list_mask_templates,
|
||||||
|
mask_templates_dir,
|
||||||
|
mask_template_label,
|
||||||
|
resolve_manual_mask_pdf,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mask_templates_dir_is_under_config():
    """The templates directory resolves to <base>/config/mask_templates."""
    project_root = Path("/tmp/anonymisation")
    expected = project_root / "config" / "mask_templates"
    assert mask_templates_dir(project_root) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_ensure_mask_templates_dir_creates_folder(tmp_path: Path):
    """The returned path is <tmp_path>/config/mask_templates and exists as a directory."""
    expected = tmp_path / "config" / "mask_templates"
    templates_dir = ensure_mask_templates_dir(tmp_path)
    assert templates_dir == expected
    assert templates_dir.is_dir()
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_manual_mask_pdf_accepts_only_pdf():
    """.pdf / .PDF paths are returned as given; a .docx path and None yield None."""
    for accepted in (Path("/tmp/test.pdf"), Path("/tmp/test.PDF")):
        assert resolve_manual_mask_pdf(accepted) == accepted
    for rejected in (Path("/tmp/test.docx"), None):
        assert resolve_manual_mask_pdf(rejected) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_manual_mask_outputs_follow_project_convention():
    """The default output and preview directory names keep their expected values."""
    observed = (DEFAULT_MASK_OUTPUT_DIRNAME, DEFAULT_MASK_PREVIEW_DIRNAME)
    assert observed == ("anonymise", "anonymise_preview")
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_mask_templates_filters_supported_extensions(tmp_path: Path):
    """The .yml and nested .json templates are listed, the .txt file is not; the nested label is "nested/gamma.json"."""
    templates_dir = ensure_mask_templates_dir(tmp_path)

    yml_template = templates_dir / "alpha.yml"
    txt_file = templates_dir / "beta.txt"
    nested_json = templates_dir / "nested" / "gamma.json"
    nested_json.parent.mkdir(parents=True)
    for candidate in (yml_template, txt_file, nested_json):
        candidate.write_text("x", encoding="utf-8")

    assert list_mask_templates(tmp_path) == [yml_template, nested_json]
    assert mask_template_label(nested_json, tmp_path) == "nested/gamma.json"
|
||||||
|
|
||||||
|
|
||||||
|
def test_append_jsonl_file_appends_non_empty_content(tmp_path: Path):
    """After the call, the target holds its own record followed by the extra file's record."""
    first_record = '{"kind":"A"}\n'
    second_record = '{"kind":"B"}\n'

    target = tmp_path / "target.jsonl"
    extra = tmp_path / "extra.jsonl"
    target.write_text(first_record, encoding="utf-8")
    extra.write_text(second_record, encoding="utf-8")

    append_jsonl_file(target, extra)

    merged = target.read_text(encoding="utf-8")
    assert merged == first_record + second_record
|
||||||
46
tests/unit/test_real_world_identifier_layouts.py
Normal file
46
tests/unit/test_real_world_identifier_layouts.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests de non-régression sur des layouts d'identifiants vus en documents réels.
|
||||||
|
"""
|
||||||
|
from anonymizer_core_refactored_onnx import (
|
||||||
|
RE_SCAN_FILENAME_ARTIFACT,
|
||||||
|
anonymise_document_regex,
|
||||||
|
load_dictionaries,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bacterio_multiline_venue_number_before_ipp_is_masked():
    """The number on the line before "IPP :" is masked as NDA and recorded in the audit."""
    venue_number = "23176885"
    # One line per element, label lines and value lines interleaved.
    text = "".join((
        "Diffusé le :\n",
        "à\n",
        "N° venue :\n",
        "31/07/1973\n",
        "VAN DE GRAAF\n",
        "23176885\n",
        "IPP :\n",
        "2300201230\n",
    ))

    anon = anonymise_document_regex([text], [[]], load_dictionaries(None))

    masked_output = anon.text_out
    assert venue_number not in masked_output
    assert "[NDA]" in masked_output
    assert any(
        hit.kind == "NDA" and hit.original == venue_number
        for hit in anon.audit
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_filename_artifact_suffix_is_masked():
    """The numeric suffix of a scan file name is masked as [DOSSIER] next to the masked IPP."""
    text = "".join((
        "IPP:\n",
        "16014215\n",
        "Document scanné non\n",
        "éditable pour patient (dont\ngénétique)\n",
        "EXT2-16014215-2300249096.TIF\n",
    ))

    anon = anonymise_document_regex([text], [[]], load_dictionaries(None))

    # The artifact pattern itself must recognise the file name once the IPP is masked.
    assert RE_SCAN_FILENAME_ARTIFACT.search("EXT2-[IPP]-2300249096.TIF") is not None

    masked_output = anon.text_out
    assert "2300249096" not in masked_output
    assert "EXT2-[IPP]-[DOSSIER].TIF" in masked_output
|
||||||
Reference in New Issue
Block a user