Externalize dictionaries and add anonymization review corpus
This commit is contained in:
@@ -2,12 +2,12 @@
|
||||
"""Debug force_term mechanism."""
|
||||
|
||||
import re
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
|
||||
# Load config
|
||||
cfg_path = Path("config/dictionnaires.yml")
|
||||
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
|
||||
|
||||
# Load effective config
|
||||
cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
cfg = load_effective_dictionaries_dict(cfg_path)
|
||||
|
||||
print("=" * 80)
|
||||
print("CONFIG LOADED")
|
||||
|
||||
@@ -5,6 +5,7 @@ import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
# Test sur 3 documents du test dataset
|
||||
@@ -32,7 +33,7 @@ for doc in test_docs:
|
||||
out_dir=out_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
ner_manager=None,
|
||||
vlm_manager=None,
|
||||
@@ -56,4 +57,3 @@ for doc in test_docs:
|
||||
print(f"❌ {pdf_path.name}: Erreur - {e}")
|
||||
|
||||
print("\n✅ Test terminé")
|
||||
|
||||
|
||||
169
tools/run_synthetic_review_corpus.py
Normal file
169
tools/run_synthetic_review_corpus.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Exécute le corpus synthétique de revue humaine et produit les diffs.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import difflib
|
||||
import json
|
||||
import shutil
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
# ROOT is the directory one level above this script's own folder (parents[1]
# of the resolved file path). It is put first on sys.path, unless already
# present, so the project imports that follow can be resolved when the script
# is launched directly.
ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from anonymizer_core_refactored_onnx import ( # noqa: E402
|
||||
anonymise_document_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
from evaluation.leak_scanner import LeakScanner # noqa: E402
|
||||
|
||||
|
||||
# Corpus layout under ROOT/tests/synthetic_review: one sub-directory per case
# in "cases", and generated outputs per case in "actual".
# SCANNER is a single LeakScanner instance created at import time and reused
# for every case.
CORPUS_DIR = ROOT / "tests" / "synthetic_review"
|
||||
CASES_DIR = CORPUS_DIR / "cases"
|
||||
ACTUAL_DIR = CORPUS_DIR / "actual"
|
||||
SCANNER = LeakScanner()
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Return *text* in a canonical form suitable for line-by-line comparison.

    Line endings are unified to ``\\n``, leading and trailing whitespace of the
    whole text is dropped, trailing whitespace is removed from every line, and
    the result always ends with exactly one newline.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    trimmed_lines = [line.rstrip() for line in unified.strip().splitlines()]
    return "\n".join(trimmed_lines) + "\n"
|
||||
|
||||
|
||||
def load_expectations(case_dir: Path) -> dict:
    """Load the optional ``expectations.json`` of a case directory.

    Returns the decoded JSON content when the file exists, and an empty dict
    when the case ships no expectations file.
    """
    candidate = case_dir / "expectations.json"
    if candidate.exists():
        raw = candidate.read_text(encoding="utf-8")
        return json.loads(raw)
    return {}
|
||||
|
||||
|
||||
def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
    """Select the audit entries worth handing to the leak scanner.

    Avoids false positives on values that are too short or ambiguous.
    An entry is dropped when its original value, with all whitespace removed,
    is shorter than 4 characters, or is purely numeric and shorter than
    6 digits.

    Args:
        audit: Audit entries; each must carry a ``"kind"`` key and may carry an
            ``"original"`` value.

    Returns:
        A list of ``{"kind", "original"}`` dicts, with ``original`` stripped of
        surrounding whitespace, in the same order as *audit*.

    Raises:
        KeyError: If a retained entry has no ``"kind"`` key.
    """
    seed = []
    for item in audit:
        raw = item.get("original")
        # An explicit None must not turn into the 4-character string "None",
        # which would otherwise pass the length filter and be seeded.
        original = "" if raw is None else str(raw).strip()
        # str.split() with no argument splits on any whitespace (tabs and
        # non-breaking spaces included), so digit groups such as "12\u00a034"
        # are measured on their digits alone.
        compact = "".join(original.split())
        if len(compact) < 4:
            continue
        if compact.isdigit() and len(compact) < 6:
            continue
        seed.append({"kind": item["kind"], "original": original})
    return seed
|
||||
|
||||
|
||||
def run_case(case_dir: Path) -> dict:
    """Run one corpus case, write its outputs and list how it deviates.

    Reads ``test.txt`` and ``expected.txt`` from *case_dir*, plus the optional
    ``config_overlay.yml`` and ``expectations.json``. The source text goes
    through ``anonymise_document_regex`` and ``selective_rescan``. The
    directory ``ACTUAL_DIR/<case name>`` is recreated from scratch and filled
    with ``actual.txt``, ``actual.audit.json``, ``actual.summary.json`` and
    ``diff.txt``.

    Returns:
        A dict with ``"case"`` (directory name), ``"failures"`` (list of
        failure labels, empty when the case passes) and ``"output_dir"``
        (output directory as a string).
    """
    overlay_path = case_dir / "config_overlay.yml"
    cfg = load_dictionaries(overlay_path if overlay_path.exists() else None)

    source_text = (case_dir / "test.txt").read_text(encoding="utf-8")
    raw_expected = (case_dir / "expected.txt").read_text(encoding="utf-8")
    expected_text = normalize_text(raw_expected)
    expectations = load_expectations(case_dir)

    anon = anonymise_document_regex([source_text], [[]], cfg)
    rescanned = selective_rescan(anon.text_out, cfg)
    actual_text = normalize_text(rescanned)

    # Flatten the audit hits into plain dicts so they can be dumped as JSON.
    audit = []
    for hit in anon.audit:
        audit.append(
            {
                "kind": hit.kind,
                "original": hit.original,
                "replacement": hit.placeholder,
            }
        )

    kinds = [entry["kind"] for entry in audit]
    summary = {
        "kinds_present": sorted(set(kinds)),
        "kind_counts": dict(sorted(Counter(kinds).items())),
        "audit_len": len(audit),
        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
    }

    # Start from an empty output directory so stale files never survive a run.
    out_dir = ACTUAL_DIR / case_dir.name
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    (out_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
    for filename, payload in (
        ("actual.audit.json", audit),
        ("actual.summary.json", summary),
    ):
        serialized = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
        (out_dir / filename).write_text(serialized, encoding="utf-8")

    diff_text = "".join(
        difflib.unified_diff(
            expected_text.splitlines(keepends=True),
            actual_text.splitlines(keepends=True),
            fromfile=f"{case_dir.name}/expected.txt",
            tofile=f"{case_dir.name}/actual.txt",
        )
    )
    (out_dir / "diff.txt").write_text(diff_text, encoding="utf-8")

    failures = []
    if actual_text != expected_text:
        failures.append("text_diff")
    if summary["leaks"]:
        failures.append("leak_detected")

    missing_kinds = sorted(
        kind
        for kind in expectations.get("required_kinds", [])
        if kind not in summary["kinds_present"]
    )
    if missing_kinds:
        failures.append(f"missing_kinds:{','.join(missing_kinds)}")

    failures.extend(
        f"missing_text:{needle}"
        for needle in expectations.get("must_contain", [])
        if needle not in actual_text
    )
    failures.extend(
        f"forbidden_text:{needle}"
        for needle in expectations.get("must_not_contain", [])
        if needle in actual_text
    )

    return {
        "case": case_dir.name,
        "failures": failures,
        "output_dir": str(out_dir),
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Run every case directory of the corpus and print one status per case.

    Returns:
        ``1`` when ``--strict`` was given and at least one case reported
        failures, ``0`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Retourne un code non nul si un cas diffère de l'attendu.",
    )
    args = parser.parse_args()

    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
    # Cases are the sub-directories of CASES_DIR, processed in sorted order.
    case_dirs = sorted(entry for entry in CASES_DIR.iterdir() if entry.is_dir())
    results = [run_case(case_dir) for case_dir in case_dirs]

    failed_count = 0
    for result in results:
        failures = result["failures"]
        if failures:
            failed_count += 1
            print(f"[FAIL] {result['case']}: {', '.join(failures)}")
        else:
            print(f"[OK] {result['case']}")
        print(f" -> {result['output_dir']}")

    return 1 if args.strict and failed_count else 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the process exit
# status via SystemExit.
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -8,6 +8,7 @@ sys.path.insert(0, '.')
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
import time
|
||||
|
||||
@@ -47,7 +48,7 @@ def test_all_cro():
|
||||
output_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml")
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
)
|
||||
|
||||
# Lire le texte anonymisé
|
||||
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
def test_chcb_detection():
|
||||
"""Test CHCB detection on the 2 documents with leaks."""
|
||||
@@ -53,7 +54,7 @@ def test_chcb_detection():
|
||||
out_dir=outdir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
)
|
||||
|
||||
@@ -102,7 +103,7 @@ def test_chcb_detection():
|
||||
out_dir=outdir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ sys.path.insert(0, '.')
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def test_date_propagation():
|
||||
@@ -47,7 +48,7 @@ def test_date_propagation():
|
||||
output_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml")
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
)
|
||||
|
||||
# Lire le texte anonymisé
|
||||
|
||||
@@ -9,6 +9,7 @@ import time
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
# Dossier de test
|
||||
test_dir = Path("/tmp/test_gui_pdfs")
|
||||
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
|
||||
out_dir=out_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=True,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
ner_manager=None,
|
||||
ner_thresholds=None,
|
||||
|
||||
@@ -8,6 +8,7 @@ import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
|
||||
# Simuler exactement ce que fait le GUI
|
||||
test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
|
||||
@@ -27,7 +28,7 @@ try:
|
||||
out_dir=out_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=True,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
ner_manager=None,
|
||||
ner_thresholds=None,
|
||||
|
||||
@@ -16,6 +16,7 @@ import re
|
||||
# Ajouter le répertoire racine au path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def test_phase1_corrections():
|
||||
@@ -52,7 +53,7 @@ def test_phase1_corrections():
|
||||
# Anonymiser le document
|
||||
result = process_pdf(
|
||||
pdf_path=pdf_path,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
ner_manager=None,
|
||||
eds_pseudo_manager=None,
|
||||
vlm_manager=None,
|
||||
|
||||
@@ -16,6 +16,7 @@ import re
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def validate_corpus_sample():
|
||||
@@ -94,7 +95,7 @@ def validate_corpus_sample():
|
||||
output_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False, # Pas de PDF pour aller plus vite
|
||||
config_path=Path("config/dictionnaires.yml")
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
)
|
||||
doc_time = time.time() - doc_start
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ import re
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def validate_full_corpus():
|
||||
@@ -70,7 +71,7 @@ def validate_full_corpus():
|
||||
output_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=True,
|
||||
config_path=Path("config/dictionnaires.yml")
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
)
|
||||
doc_time = time.time() - doc_start
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ from pathlib import Path
|
||||
import json
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
# 5 documents du corpus production (OGC 008)
|
||||
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
|
||||
out_dir=out_dir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=False,
|
||||
config_path=Path("config/dictionnaires.yml"),
|
||||
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
use_hf=False,
|
||||
ner_manager=None,
|
||||
vlm_manager=None,
|
||||
|
||||
Reference in New Issue
Block a user