Externalize dictionaries and add anonymization review corpus

This commit is contained in:
2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions

View File

@@ -2,12 +2,12 @@
"""Debug force_term mechanism."""
import re
import yaml
from pathlib import Path
# Load config
cfg_path = Path("config/dictionnaires.yml")
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
# Load effective config
cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
cfg = load_effective_dictionaries_dict(cfg_path)
print("=" * 80)
print("CONFIG LOADED")

View File

@@ -5,6 +5,7 @@ import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
# Test sur 3 documents du test dataset
@@ -32,7 +33,7 @@ for doc in test_docs:
out_dir=out_dir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
ner_manager=None,
vlm_manager=None,
@@ -56,4 +57,3 @@ for doc in test_docs:
print(f"{pdf_path.name}: Erreur - {e}")
print("\n✅ Test terminé")

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Exécute le corpus synthétique de revue humaine et produit les diffs.
"""
from __future__ import annotations
import argparse
import difflib
import json
import shutil
import sys
from collections import Counter
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from anonymizer_core_refactored_onnx import ( # noqa: E402
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
from evaluation.leak_scanner import LeakScanner # noqa: E402
# Root of the synthetic human-review corpus inside the repository.
CORPUS_DIR = ROOT / "tests" / "synthetic_review"
# Input side: main() treats every sub-directory found here as one review case.
CASES_DIR = CORPUS_DIR / "cases"
# Output side: run_case() recreates one sub-directory per case here on each run.
ACTUAL_DIR = CORPUS_DIR / "actual"
# Single scanner instance shared by all cases; run_case() calls its scan_text().
SCANNER = LeakScanner()
def normalize_text(text: str) -> str:
    """Return *text* in a canonical form suitable for line-by-line diffing.

    CRLF and lone CR line endings become LF, leading/trailing whitespace of
    the whole text is dropped, trailing whitespace of every line is removed,
    and the result always ends with exactly one newline.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines = [line.rstrip() for line in unified.strip().splitlines()]
    return "\n".join(cleaned_lines) + "\n"
def load_expectations(case_dir: Path) -> dict:
    """Load the optional ``expectations.json`` of a review case.

    Args:
        case_dir: Directory of the case.

    Returns:
        The decoded JSON content, or an empty dict when the file is absent.
    """
    manifest = case_dir / "expectations.json"
    if manifest.exists():
        return json.loads(manifest.read_text(encoding="utf-8"))
    return {}
def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
    """Select the audit entries that are safe to search for as leaks.

    Values that are too short or ambiguous are dropped to avoid false
    positives: anything shorter than 4 characters once spaces are removed,
    and purely numeric values shorter than 6 digits.

    Args:
        audit: Audit entries; each is a mapping with a ``"kind"`` key and an
            optional ``"original"`` value.

    Returns:
        A list of ``{"kind": ..., "original": ...}`` dicts, where
        ``original`` is the stripped string form of the audited value.

    Raises:
        KeyError: If a retained entry has no ``"kind"`` key.
    """
    seed = []
    for item in audit:
        raw = item.get("original")
        # A present-but-null value must count as empty: str(None) would give
        # the 4-character literal "None", which passes the length filter and
        # would then be searched for in the anonymised text.
        original = "" if raw is None else str(raw).strip()
        compact = original.replace(" ", "")
        if len(compact) < 4:
            continue
        if compact.isdigit() and len(compact) < 6:
            continue
        seed.append({"kind": item["kind"], "original": original})
    return seed
def run_case(case_dir: Path) -> dict:
    """Run one synthetic review case and write its artefacts to disk.

    The case directory must contain ``test.txt`` and ``expected.txt``; it may
    contain ``config_overlay.yml`` (passed to ``load_dictionaries``) and
    ``expectations.json``. The output directory ``ACTUAL_DIR/<case name>`` is
    wiped and recreated, then filled with ``actual.txt``,
    ``actual.audit.json``, ``actual.summary.json`` and ``diff.txt``.

    Args:
        case_dir: Directory of the case to execute.

    Returns:
        A dict with the case name (``"case"``), the list of failure labels
        (``"failures"``, empty on success) and the output directory as a
        string (``"output_dir"``).
    """
    overlay = case_dir / "config_overlay.yml"
    cfg = load_dictionaries(overlay if overlay.exists() else None)

    # Inputs are read before anything is written, so a missing file aborts
    # the case without touching the previous output directory.
    source_text = (case_dir / "test.txt").read_text(encoding="utf-8")
    expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expectations = load_expectations(case_dir)

    anon = anonymise_document_regex([source_text], [[]], cfg)
    actual_text = normalize_text(selective_rescan(anon.text_out, cfg))

    audit = []
    for hit in anon.audit:
        audit.append(
            {
                "kind": hit.kind,
                "original": hit.original,
                "replacement": hit.placeholder,
            }
        )

    kind_counter = Counter(entry["kind"] for entry in audit)
    summary = {
        "kinds_present": sorted(kind_counter),
        "kind_counts": dict(sorted(kind_counter.items())),
        "audit_len": len(audit),
        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
    }

    # Start from a clean output directory so stale artefacts never survive.
    out_dir = ACTUAL_DIR / case_dir.name
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    (out_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
    for filename, payload in (
        ("actual.audit.json", audit),
        ("actual.summary.json", summary),
    ):
        (out_dir / filename).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
            encoding="utf-8",
        )

    diff_text = "".join(
        difflib.unified_diff(
            expected_text.splitlines(keepends=True),
            actual_text.splitlines(keepends=True),
            fromfile=f"{case_dir.name}/expected.txt",
            tofile=f"{case_dir.name}/actual.txt",
        )
    )
    (out_dir / "diff.txt").write_text(diff_text, encoding="utf-8")

    failures = []
    if actual_text != expected_text:
        failures.append("text_diff")
    if summary["leaks"]:
        failures.append("leak_detected")

    present = summary["kinds_present"]
    missing_kinds = sorted(
        kind for kind in expectations.get("required_kinds", []) if kind not in present
    )
    if missing_kinds:
        failures.append(f"missing_kinds:{','.join(missing_kinds)}")

    failures.extend(
        f"missing_text:{needle}"
        for needle in expectations.get("must_contain", [])
        if needle not in actual_text
    )
    failures.extend(
        f"forbidden_text:{needle}"
        for needle in expectations.get("must_not_contain", [])
        if needle in actual_text
    )

    return {
        "case": case_dir.name,
        "failures": failures,
        "output_dir": str(out_dir),
    }
def main() -> int:
    """Run every case of the synthetic review corpus and print a report.

    Each sub-directory of ``CASES_DIR`` is executed in sorted order through
    ``run_case``; one status line and one output-directory line are printed
    per case.

    Returns:
        ``1`` when ``--strict`` was given and at least one case failed,
        ``0`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Retourne un code non nul si un cas diffère de l'attendu.",
    )
    options = parser.parse_args()

    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
    case_dirs = sorted(entry for entry in CASES_DIR.iterdir() if entry.is_dir())
    # All cases run before anything is reported.
    outcomes = list(map(run_case, case_dirs))

    for outcome in outcomes:
        problems = outcome["failures"]
        if problems:
            print(f"[FAIL] {outcome['case']}: {', '.join(problems)}")
        else:
            print(f"[OK] {outcome['case']}")
        print(f" -> {outcome['output_dir']}")

    any_failed = any(outcome["failures"] for outcome in outcomes)
    return 1 if options.strict and any_failed else 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -8,6 +8,7 @@ sys.path.insert(0, '.')
from pathlib import Path
import re
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
import time
@@ -47,7 +48,7 @@ def test_all_cro():
output_dir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml")
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
)
# Lire le texte anonymisé

View File

@@ -8,6 +8,7 @@ import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
def test_chcb_detection():
"""Test CHCB detection on the 2 documents with leaks."""
@@ -53,7 +54,7 @@ def test_chcb_detection():
out_dir=outdir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
)
@@ -102,7 +103,7 @@ def test_chcb_detection():
out_dir=outdir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
)

View File

@@ -9,6 +9,7 @@ sys.path.insert(0, '.')
from pathlib import Path
import re
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
def test_date_propagation():
@@ -47,7 +48,7 @@ def test_date_propagation():
output_dir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml")
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
)
# Lire le texte anonymisé

View File

@@ -9,6 +9,7 @@ import time
sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Dossier de test
test_dir = Path("/tmp/test_gui_pdfs")
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
out_dir=out_dir,
make_vector_redaction=False,
also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
ner_manager=None,
ner_thresholds=None,

View File

@@ -8,6 +8,7 @@ import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Simuler exactement ce que fait le GUI
test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
@@ -27,7 +28,7 @@ try:
out_dir=out_dir,
make_vector_redaction=False,
also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
ner_manager=None,
ner_thresholds=None,

View File

@@ -16,6 +16,7 @@ import re
# Ajouter le répertoire racine au path
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
def test_phase1_corrections():
@@ -52,7 +53,7 @@ def test_phase1_corrections():
# Anonymiser le document
result = process_pdf(
pdf_path=pdf_path,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
ner_manager=None,
eds_pseudo_manager=None,
vlm_manager=None,

View File

@@ -16,6 +16,7 @@ import re
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
def validate_corpus_sample():
@@ -94,7 +95,7 @@ def validate_corpus_sample():
output_dir,
make_vector_redaction=False,
also_make_raster_burn=False, # Pas de PDF pour aller plus vite
config_path=Path("config/dictionnaires.yml")
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
)
doc_time = time.time() - doc_start

View File

@@ -17,6 +17,7 @@ import re
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
def validate_full_corpus():
@@ -70,7 +71,7 @@ def validate_full_corpus():
output_dir,
make_vector_redaction=False,
also_make_raster_burn=True,
config_path=Path("config/dictionnaires.yml")
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
)
doc_time = time.time() - doc_start

View File

@@ -10,6 +10,7 @@ from pathlib import Path
import json
sys.path.insert(0, str(Path(__file__).parent.parent))
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from anonymizer_core_refactored_onnx import process_pdf
# 5 documents du corpus production (OGC 008)
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
out_dir=out_dir,
make_vector_redaction=False,
also_make_raster_burn=False,
config_path=Path("config/dictionnaires.yml"),
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
use_hf=False,
ner_manager=None,
vlm_manager=None,