# anonymisation/tests/unit/test_gui_v6_processing_runner.py

"""Tests du runner G2 : process_fn injectée, vrais fichiers tmp, aucun moteur réel."""

from __future__ import annotations

import threading
from pathlib import Path

import pytest

from gui_v6.processing_runner import (
    ProcessingRunner,
    RunSummary,
    default_output_dir,
    discover_documents,
)

_EXTS = (".pdf", ".txt")


def _touch(path: Path) -> Path:
    """Create *path* (and any missing parent directories) containing "x".

    Returns the same path so the call can be used inline in a test.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write("x")
    return path


# -- découverte & chemins --------------------------------------------------

def test_discover_single_file(tmp_path):
    """A single file with a supported extension is returned as the only document."""
    document = _touch(tmp_path / "doc.pdf")
    discovered = discover_documents(document, _EXTS)
    assert discovered == [document]


def test_discover_single_file_unsupported(tmp_path):
    """A single file whose extension is not in the accepted list yields nothing."""
    unsupported = _touch(tmp_path / "doc.xyz")
    discovered = discover_documents(unsupported, _EXTS)
    assert discovered == []


def test_discover_folder_sorted_and_skips_output(tmp_path):
    """Folder discovery returns the expected ordered names and skips "anonymise"."""
    # Created deliberately out of order so the expected ordering is meaningful.
    for relative_name in ("b.pdf", "a.pdf", "note.txt"):
        _touch(tmp_path / relative_name)
    # File located in the output subtree: it must be ignored.
    _touch(tmp_path / "anonymise" / "already.pdf")

    discovered_names = [entry.name for entry in discover_documents(tmp_path, _EXTS)]

    assert discovered_names == ["a.pdf", "b.pdf", "note.txt"]


def test_default_output_dir_file_and_dir(tmp_path):
    """A file and its containing folder both map to ``<folder>/anonymise``."""
    expected_dir = tmp_path / "anonymise"
    source_file = _touch(tmp_path / "doc.pdf")
    for source in (source_file, tmp_path):
        assert default_output_dir(source) == expected_dir


# -- exécution -------------------------------------------------------------

def test_run_processes_all_docs(tmp_path):
    """Every discovered document is passed to ``process_fn`` and counted as succeeded."""
    for filename in ("a.pdf", "b.pdf"):
        _touch(tmp_path / filename)
    calls = []

    def record_call(doc, out):
        calls.append((doc, out))
        return {}

    runner = ProcessingRunner(process_fn=record_call, extensions=_EXTS)
    summary = runner.run(tmp_path)

    assert isinstance(summary, RunSummary)
    assert summary.ok is True
    assert summary.total == 2
    assert summary.succeeded == 2
    assert summary.failed == 0
    assert len(calls) == 2
    # The default output folder has been created.
    default_dir = tmp_path / "anonymise"
    assert default_dir.is_dir()


def test_run_single_file_uses_output_dir(tmp_path):
    """An explicit ``output_dir`` is created and handed to ``process_fn`` with the file."""
    source = _touch(tmp_path / "doc.pdf")
    target_dir = tmp_path / "sortie"
    seen = {}

    def capture(doc, out):
        seen["doc"] = doc
        seen["out"] = out
        return {}

    runner = ProcessingRunner(process_fn=capture, extensions=_EXTS)
    summary = runner.run(source, output_dir=target_dir)

    assert summary.total == 1
    assert summary.succeeded == 1
    assert seen["doc"] == source
    assert seen["out"] == target_dir
    assert target_dir.is_dir()


def test_run_continues_after_failure(tmp_path):
    """An exception on one document is recorded and the other documents still run."""
    for filename in ("a.pdf", "boom.pdf", "c.pdf"):
        _touch(tmp_path / filename)

    def explode_on_boom(doc, out):
        if doc.name != "boom.pdf":
            return {}
        raise RuntimeError("explosion")

    summary = ProcessingRunner(process_fn=explode_on_boom, extensions=_EXTS).run(tmp_path)

    assert summary.ok is False
    assert summary.total == 3
    assert summary.succeeded == 2
    assert summary.failed == 1
    first_error = summary.errors[0]
    assert first_error[0] == "boom.pdf"
    assert "explosion" in first_error[1]


def test_run_marks_quarantined_engine_result_as_failure(tmp_path):
    """A result with status "quarantined" is counted as failed and logged as such."""
    scan = _touch(tmp_path / "scan.pdf")
    log_lines = []

    def quarantine(doc, out):
        return dict(status="quarantined", reason="preflight_text_too_short")

    runner = ProcessingRunner(process_fn=quarantine, extensions=_EXTS)
    summary = runner.run(scan, on_log=log_lines.append)

    assert summary.ok is False
    assert summary.succeeded == 0
    assert summary.failed == 1
    assert summary.documents[0].status == "failed"
    assert "preflight_text_too_short" in summary.errors[0][1]
    failure_lines = [line for line in log_lines if "ÉCHEC : scan.pdf" in line]
    assert failure_lines


def test_run_marks_missing_pdf_output_as_failure(tmp_path):
    """A result that only references text and audit files (no PDF) is a failure."""
    source = _touch(tmp_path / "doc.pdf")
    target_dir = tmp_path / "sortie"

    def text_and_audit_only(doc, out_dir):
        # Artefacts are written in this order: text first, then audit.
        produced = {
            "text": (out_dir / "doc.pseudonymise.txt", "ok"),
            "audit": (out_dir / "doc.audit.jsonl", "{}"),
        }
        for artefact, content in produced.values():
            artefact.write_text(content, encoding="utf-8")
        return {key: str(artefact) for key, (artefact, _content) in produced.items()}

    runner = ProcessingRunner(process_fn=text_and_audit_only, extensions=_EXTS)
    summary = runner.run(source, output_dir=target_dir)

    assert summary.succeeded == 0
    assert summary.failed == 1
    first_document = summary.documents[0]
    assert first_document.status == "failed"
    assert "Aucune sortie PDF" in summary.errors[0][1]


def test_run_accepts_existing_pdf_output(tmp_path):
    """A result referencing a PDF that exists on disk is counted as a success."""
    source = _touch(tmp_path / "doc.pdf")
    target_dir = tmp_path / "sortie"

    def write_raster_pdf(doc, out_dir):
        raster = out_dir / "doc.redacted_raster.pdf"
        raster.write_bytes(b"%PDF-1.4\n")
        raster_location = str(raster)
        return {"pdf_raster": raster_location}

    runner = ProcessingRunner(process_fn=write_raster_pdf, extensions=_EXTS)
    summary = runner.run(source, output_dir=target_dir)

    assert summary.succeeded == 1
    assert summary.failed == 0
    first_document = summary.documents[0]
    assert first_document.status == "success"


def test_run_empty_folder(tmp_path):
    """Running on an empty folder reports zero documents and logs "Aucun document"."""
    log_lines = []

    def never_called(d, o):
        return {}

    runner = ProcessingRunner(process_fn=never_called, extensions=_EXTS)
    summary = runner.run(tmp_path, on_log=log_lines.append)

    assert summary.total == 0
    matching = [message for message in log_lines if "Aucun document" in message]
    assert matching


def test_stop_event_interrupts_between_docs(tmp_path):
    """Setting the stop event during the first document prevents the next ones."""
    for filename in ("a.pdf", "b.pdf", "c.pdf"):
        _touch(tmp_path / filename)
    stop_requested = threading.Event()
    processed_names = []

    def process_then_stop(doc, out):
        processed_names.append(doc.name)
        # Request the stop as soon as the first document has been handled.
        stop_requested.set()
        return {}

    runner = ProcessingRunner(process_fn=process_then_stop, extensions=_EXTS)
    summary = runner.run(tmp_path, stop_event=stop_requested)

    assert summary.stopped is True
    assert summary.succeeded == 1
    # The stop took effect between two documents.
    assert len(processed_names) == 1


def test_progress_callbacks(tmp_path):
    """``on_progress`` eventually reports ``(done, total) == (2, 2)`` for two documents."""
    for filename in ("a.pdf", "b.pdf"):
        _touch(tmp_path / filename)
    progress_events = []

    def record_progress(done, total, name):
        progress_events.append((done, total))

    def succeed(d, o):
        return {}

    runner = ProcessingRunner(process_fn=succeed, extensions=_EXTS)
    runner.run(tmp_path, on_progress=record_progress)

    # Final progress value has been reached.
    assert (2, 2) in progress_events


def test_run_fails_fast_when_output_not_writable(tmp_path, monkeypatch):
    """``run`` raises ``OutputNotWritableError`` when ``Path.mkdir`` raises ``PermissionError``."""
    from gui_v6.processing_runner import OutputNotWritableError, ProcessingRunner

    # Both directories are created BEFORE mkdir is patched to fail.
    source_dir = tmp_path / "in"
    source_dir.mkdir()
    sample = source_dir / "a.txt"
    sample.write_text("x", encoding="utf-8")
    blocked_dir = tmp_path / "ro"
    blocked_dir.mkdir()

    def deny_mkdir(*args, **kwargs):
        raise PermissionError("read-only")

    monkeypatch.setattr("gui_v6.processing_runner.Path.mkdir", deny_mkdir)

    def succeed(d, o):
        return {}

    runner = ProcessingRunner(process_fn=succeed)
    with pytest.raises(OutputNotWritableError):
        runner.run(source_dir, blocked_dir)


def test_no_double_run(tmp_path):
    """A second ``run`` on a runner that is already running raises ``RuntimeError``.

    A worker thread starts a run whose ``process_fn`` blocks on an event. While
    it is blocked, the main thread attempts another run and expects a refusal.
    Once the worker is released and has finished, ``is_running`` must be False.
    """
    _touch(tmp_path / "a.pdf")
    started = threading.Event()
    release = threading.Event()

    def proc(doc, out):
        started.set()
        # Hold the run open until the main thread has attempted the second run.
        release.wait(timeout=2)
        return {}

    runner = ProcessingRunner(process_fn=proc, extensions=_EXTS)
    worker = threading.Thread(target=lambda: runner.run(tmp_path))
    worker.start()
    try:
        assert started.wait(timeout=2)
        # While the first run is in progress, a second launch is refused.
        with pytest.raises(RuntimeError):
            runner.run(tmp_path)
    finally:
        # Always unblock and reap the worker, even when an assertion above
        # fails, so a failing test does not leave a blocked thread behind.
        release.set()
        worker.join(timeout=2)
    # join() returns silently on timeout: check the worker really finished.
    assert not worker.is_alive()
    assert runner.is_running is False


# -- détails par document (télémétrie) -------------------------------------

def test_run_records_per_document_details(tmp_path):
    """Each document gets a detail record with ordinal, status, extension and duration."""
    for filename in ("a.pdf", "b.pdf"):
        _touch(tmp_path / filename)

    def fail_on_b(doc, out):
        if doc.name != "b.pdf":
            return {}
        raise RuntimeError("boom")

    summary = ProcessingRunner(process_fn=fail_on_b, extensions=_EXTS).run(tmp_path)
    details = summary.documents

    assert len(details) == 2
    status_by_ordinal = {detail.ordinal: detail.status for detail in details}
    assert status_by_ordinal == {0: "success", 1: "failed"}
    for detail in summary.documents:
        assert detail.extension == "pdf"
        assert isinstance(detail.duration_ms, int)
        # GDPR: no file name or path may appear in the per-document details.
        for forbidden_attr in ("path", "filename", "name"):
            assert not hasattr(detail, forbidden_attr)


# -- diagnostics d'erreur RGPD-safe (E2) -----------------------------------

def test_failed_doc_carries_rgpd_safe_error_fields(tmp_path):
    """A failed document exposes an error type and code but none of the exception text."""
    from gui_v6.processing_runner import ProcessingRunner

    # Stand-in for PII that ends up inside an exception message.
    secret = "Dupont Jean 1980"

    def raise_with_pii(_inp, _out):
        raise ValueError(f"échec sur patient {secret}")

    source_dir = tmp_path / "in"
    source_dir.mkdir()
    (source_dir / "a.pdf").write_bytes(b"%PDF-1.4\n")
    target_dir = tmp_path / "out"
    target_dir.mkdir()

    summary = ProcessingRunner(process_fn=raise_with_pii).run(source_dir, target_dir)

    assert summary.failed == 1
    detail = summary.documents[0]
    assert detail.error_type == "ValueError"
    assert detail.error_code in {"ner_unavailable", "quarantined", "no_output", "processing_error"}
    # Lower-cased dump of every attribute of the record, searched for leaked text.
    dumped = repr(vars(detail)).lower()
    for leaked_fragment in ("dupont", "patient", secret.lower()):
        assert leaked_fragment not in dumped


def test_success_doc_has_no_error_fields(tmp_path):
    """A successful document has both ``error_type`` and ``error_code`` set to None."""
    from gui_v6.processing_runner import ProcessingRunner

    def write_deliverable(_inp, out_dir):
        # process_fn receives the output FOLDER: write a deliverable PDF into it.
        deliverable = out_dir / "a.redacted_raster.pdf"
        deliverable.write_bytes(b"%PDF-1.4\n")
        return {"status": "ok", "pdf_raster": str(deliverable)}

    source_dir = tmp_path / "in"
    source_dir.mkdir()
    (source_dir / "a.pdf").write_bytes(b"%PDF-1.4\n")
    target_dir = tmp_path / "out"
    target_dir.mkdir()

    runner = ProcessingRunner(process_fn=write_deliverable)
    summary = runner.run(source_dir, target_dir)

    detail = summary.documents[0]
    assert detail.status == "success"
    assert detail.error_type is None
    assert detail.error_code is None


# -- classification d'erreur : une assertion par branche (mapping vérifié) -

def test_classify_error_code_ner_unavailable():
    """``EngineUnavailableError`` is classified as "ner_unavailable"."""
    # Import the REAL exception class on purpose: if it is ever renamed, this
    # test breaks, which is the intended guard.
    from gui_v6.engine_bridge import EngineUnavailableError
    from gui_v6.processing_runner import classify_error_code

    error = EngineUnavailableError("modèle indispo")
    assert classify_error_code(error) == "ner_unavailable"


def test_classify_error_code_quarantined():
    """A ``RuntimeError`` carrying the quarantine message is classified as "quarantined"."""
    from gui_v6.processing_runner import classify_error_code

    error = RuntimeError("Document mis en quarantaine : texte trop court")
    assert classify_error_code(error) == "quarantined"


def test_classify_error_code_no_output():
    """A ``RuntimeError`` carrying the missing-PDF message is classified as "no_output"."""
    from gui_v6.processing_runner import classify_error_code

    error = RuntimeError("Aucune sortie PDF anonymisée produite")
    assert classify_error_code(error) == "no_output"


def test_classify_error_code_processing_error_default():
    """An unrecognised exception maps to "processing_error", a member of ``_ERROR_CODES``."""
    from gui_v6.processing_runner import _ERROR_CODES, classify_error_code

    fallback_code = classify_error_code(ValueError("patient Dupont"))
    assert fallback_code == "processing_error"
    other_code = classify_error_code(ValueError("x"))
    assert other_code in _ERROR_CODES