feat(ocr): migrer l'OCR de docTR (PyTorch) vers OnnxTR (ONNX Runtime)

OnnxTR exécute les MÊMES modèles que docTR (db_resnet50 + crnn_vgg16_bn) sur ONNX Runtime, sans PyTorch. Corrige le crash torch/oneDNN « could not create a primitive » sur CPU contraint (VM 2 cœurs collaborateur : OCR scan impossible → quarantaine). Qualité identique validée empiriquement (CER 0,10-0,23 % vs docTR, 2 validations indépendantes Claude+Qwen), OCR ~2-3× plus rapide CPU. - core : import OnnxTR, _get_ocr_model(), _OCR_AVAILABLE, boucle OCR inchangée (API miroir) ; ONNXTR_CACHE_DIR pour le frozen ; bandeau de logs ENV au démarrage (OS, CPU+AVX, cœurs, RAM, versions, providers) pour retours terrain auto-suffisants. - 3 .spec : embarquent les poids ONNX OnnxTR (fail-closed) + hiddenimports onnxtr. - requirements : onnxtr[cpu] (python-doctr conservé transitoirement). - inclut le correctif quarantaine-visible du runner (GO Qwen). Tests : test_ocr_onnxtr.py (RED→GREEN), 95 unit passed, e2e scan client OK (OCR 5/5, PDF produit, plus de crash). Retrait torch du frozen + rebuild Windows = étapes suivantes (gates Dom). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 17:07:00 +02:00
parent 80d8cc230b
commit 8d683bc6d8
8 changed files with 323 additions and 21 deletions
--- a/tests/unit/test_gui_v6_processing_runner.py
+++ b/tests/unit/test_gui_v6_processing_runner.py
@@ -106,6 +106,61 @@ def test_run_continues_after_failure(tmp_path):
    assert "explosion" in summary.errors[0][1]


+def test_run_marks_quarantined_engine_result_as_failure(tmp_path):  # A "quarantined" engine result must be reported as a failure, not a success.
+    f = _touch(tmp_path / "scan.pdf")  # presumably creates the input file and returns its path — confirm against _touch
+    logs = []  # collects every message the runner passes to on_log
+
+    def proc(doc, out):  # stub engine: ignores its arguments and always reports a quarantine
+        return {"status": "quarantined", "reason": "preflight_text_too_short"}
+
+    runner = ProcessingRunner(process_fn=proc, extensions=_EXTS)
+    summary = runner.run(f, on_log=logs.append)
+
+    assert summary.succeeded == 0
+    assert summary.failed == 1
+    assert summary.ok is False
+    assert summary.documents[0].status == "failed"
+    assert "preflight_text_too_short" in summary.errors[0][1]  # the quarantine reason must appear in the reported error
+    assert any("ÉCHEC : scan.pdf" in item for item in logs)  # the failure must also show up in the log messages
+
+
+def test_run_marks_missing_pdf_output_as_failure(tmp_path):  # A result that lists only text and audit outputs (no PDF) must be reported as a failure.
+    f = _touch(tmp_path / "doc.pdf")  # presumably creates the input file and returns its path — confirm against _touch
+    out = tmp_path / "sortie"  # not created here; NOTE(review): assumes the runner creates output_dir before calling process_fn — confirm
+
+    def proc(doc, out_dir):  # stub engine: writes a text file and an audit file, but no PDF
+        txt = out_dir / "doc.pseudonymise.txt"
+        audit = out_dir / "doc.audit.jsonl"
+        txt.write_text("ok", encoding="utf-8")
+        audit.write_text("{}", encoding="utf-8")
+        return {"text": str(txt), "audit": str(audit)}  # both returned paths exist on disk; no PDF entry is returned
+
+    runner = ProcessingRunner(process_fn=proc, extensions=_EXTS)
+    summary = runner.run(f, output_dir=out)
+
+    assert summary.succeeded == 0
+    assert summary.failed == 1
+    assert summary.documents[0].status == "failed"
+    assert "Aucune sortie PDF" in summary.errors[0][1]  # the error text must state that no PDF output was produced
+
+
+def test_run_accepts_existing_pdf_output(tmp_path):  # A result whose PDF output exists on disk must be reported as a success.
+    f = _touch(tmp_path / "doc.pdf")  # presumably creates the input file and returns its path — confirm against _touch
+    out = tmp_path / "sortie"  # not created here; NOTE(review): assumes the runner creates output_dir before calling process_fn — confirm
+
+    def proc(doc, out_dir):  # stub engine: writes a minimal PDF header to disk and returns its path
+        pdf = out_dir / "doc.redacted_raster.pdf"
+        pdf.write_bytes(b"%PDF-1.4\n")
+        return {"pdf_raster": str(pdf)}  # the PDF path is returned under the "pdf_raster" key
+
+    runner = ProcessingRunner(process_fn=proc, extensions=_EXTS)
+    summary = runner.run(f, output_dir=out)
+
+    assert summary.succeeded == 1
+    assert summary.failed == 0
+    assert summary.documents[0].status == "success"
+
+
 def test_run_empty_folder(tmp_path):
    logs = []
    runner = ProcessingRunner(process_fn=lambda d, o: {}, extensions=_EXTS)
--- a/tests/unit/test_ocr_onnxtr.py
+++ b/tests/unit/test_ocr_onnxtr.py
@@ -0,0 +1,39 @@
+"""Migration OCR docTR → OnnxTR : le moteur OCR est OnnxTR et lit le texte rendu.
+
+Pas de mock : on exerce le vrai predictor OCR du moteur sur une image réelle.
+"""
+import numpy as np
+import pytest
+from PIL import Image, ImageDraw, ImageFont
+
+import anonymizer_core_refactored_onnx as core
+
+
+def test_ocr_engine_is_onnxtr():
+    # After the migration, the OCR engine must be OnnxTR (ONNX Runtime, no torch).
+    assert core._OCR_AVAILABLE, "moteur OCR indisponible"
+    model = core._get_ocr_model()
+    assert "onnxtr" in type(model).__module__.lower(), type(model).__module__  # checks the defining module of the model's class; on failure the message is that module name
+
+
+@pytest.mark.slow
+def test_ocr_reads_rendered_text():  # Renders four known words onto a white image and checks the OCR model reads at least two of them.
+    img = Image.new("RGB", (1400, 300), "white")
+    draw = ImageDraw.Draw(img)
+    try:  # font selection: DejaVu Sans Bold first, then regular, then Pillow's default font
+        font = ImageFont.truetype("DejaVuSans-Bold.ttf", 64)
+    except OSError:
+        try:
+            font = ImageFont.truetype("DejaVuSans.ttf", 64)
+        except OSError:
+            font = ImageFont.load_default()  # NOTE(review): the default font may be much smaller than 64 px, which could make the OCR assertion flaky — confirm
+    words = ["BORDEAUX", "DUPONT", "MARTIN", "BAYONNE"]
+    draw.text((40, 110), " ".join(words), fill="black", font=font)  # all four words are drawn on one line, separated by spaces
+
+    model = core._get_ocr_model()
+    result = model([np.array(img)])  # the model is called with a one-element list holding the image as a NumPy array
+    got = " ".join(  # flattens the first page (blocks -> lines -> words) into one upper-cased string
+        w.value for b in result.pages[0].blocks for l in b.lines for w in l.words
+    ).upper()
+    found = sum(1 for w in words if w in got)  # counts expected words found as substrings of the OCR output
+    assert found >= 2, f"OCR a lu: {got!r}"  # passes when at least two of the four words are found; the message shows what the OCR read