feat(ocr): migrer l'OCR de docTR (PyTorch) vers OnnxTR (ONNX Runtime)
OnnxTR exécute les MÊMES modèles que docTR (db_resnet50 + crnn_vgg16_bn) sur ONNX Runtime, sans PyTorch. Corrige le crash torch/oneDNN « could not create a primitive » sur CPU contraint (VM 2 cœurs collaborateur : OCR scan impossible → quarantaine). Qualité identique validée empiriquement (CER 0,10-0,23 % vs docTR, 2 validations indépendantes Claude+Qwen), OCR ~2-3× plus rapide CPU. - core : import OnnxTR, _get_ocr_model(), _OCR_AVAILABLE, boucle OCR inchangée (API miroir) ; ONNXTR_CACHE_DIR pour le frozen ; bandeau de logs ENV au démarrage (OS, CPU+AVX, cœurs, RAM, versions, providers) pour retours terrain auto-suffisants. - 3 .spec : embarquent les poids ONNX OnnxTR (fail-closed) + hiddenimports onnxtr. - requirements : onnxtr[cpu] (python-doctr conservé transitoirement). - inclut le correctif quarantaine-visible du runner (GO Qwen). Tests : test_ocr_onnxtr.py (RED→GREEN), 95 unit passed, e2e scan client OK (OCR 5/5, PDF produit, plus de crash). Retrait torch du frozen + rebuild Windows = étapes suivantes (gates Dom). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -106,6 +106,61 @@ def test_run_continues_after_failure(tmp_path):
|
||||
assert "explosion" in summary.errors[0][1]
|
||||
|
||||
|
||||
def test_run_marks_quarantined_engine_result_as_failure(tmp_path):
    """A result dict with status "quarantined" must be reported as a failure.

    The engine stub does not raise; it returns a quarantine status with a
    reason. The run summary is expected to count the document as failed,
    carry the reason in its errors, and emit a failure line in the log.
    """
    captured = []
    source = _touch(tmp_path / "scan.pdf")

    def proc(doc, out):
        # Stub engine: signals quarantine through its return value only.
        return dict(status="quarantined", reason="preflight_text_too_short")

    report = ProcessingRunner(process_fn=proc, extensions=_EXTS).run(
        source, on_log=captured.append
    )

    # Counters and overall verdict.
    assert report.succeeded == 0
    assert report.failed == 1
    assert report.ok is False

    # Per-document status and the recorded reason.
    first_document = report.documents[0]
    assert first_document.status == "failed"
    first_error = report.errors[0]
    assert "preflight_text_too_short" in first_error[1]

    # At least one log line must announce the failure for this file.
    failure_lines = ["ÉCHEC : scan.pdf" in line for line in captured]
    assert any(failure_lines)
|
||||
|
||||
|
||||
def test_run_marks_missing_pdf_output_as_failure(tmp_path):
    """An engine result that lists no PDF output must be reported as a failure.

    The stub writes a text file and an audit file and returns only those two
    paths; the summary is expected to flag the document as failed with an
    error mentioning the missing PDF output.
    """
    destination = tmp_path / "sortie"
    source = _touch(tmp_path / "doc.pdf")

    def proc(doc, out_dir):
        # (result key, file name, file content) — written in this order.
        artifacts = (
            ("text", "doc.pseudonymise.txt", "ok"),
            ("audit", "doc.audit.jsonl", "{}"),
        )
        produced = {}
        for key, filename, content in artifacts:
            target = out_dir / filename
            target.write_text(content, encoding="utf-8")
            produced[key] = str(target)
        return produced

    report = ProcessingRunner(process_fn=proc, extensions=_EXTS).run(
        source, output_dir=destination
    )

    assert report.succeeded == 0
    assert report.failed == 1
    first_document = report.documents[0]
    assert first_document.status == "failed"
    first_error = report.errors[0]
    assert "Aucune sortie PDF" in first_error[1]
|
||||
|
||||
|
||||
def test_run_accepts_existing_pdf_output(tmp_path):
    """An engine result pointing at a PDF it actually wrote counts as success."""
    destination = tmp_path / "sortie"
    source = _touch(tmp_path / "doc.pdf")

    def proc(doc, out_dir):
        # Stub engine: writes a minimal PDF header and reports its path.
        raster_path = out_dir / "doc.redacted_raster.pdf"
        raster_path.write_bytes(b"%PDF-1.4\n")
        return dict(pdf_raster=str(raster_path))

    report = ProcessingRunner(process_fn=proc, extensions=_EXTS).run(
        source, output_dir=destination
    )

    assert report.succeeded == 1
    assert report.failed == 0
    first_document = report.documents[0]
    assert first_document.status == "success"
|
||||
|
||||
|
||||
def test_run_empty_folder(tmp_path):
|
||||
logs = []
|
||||
runner = ProcessingRunner(process_fn=lambda d, o: {}, extensions=_EXTS)
|
||||
|
||||
39
tests/unit/test_ocr_onnxtr.py
Normal file
39
tests/unit/test_ocr_onnxtr.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""Migration OCR docTR → OnnxTR : le moteur OCR est OnnxTR et lit le texte rendu.
|
||||
|
||||
Pas de mock : on exerce le vrai predictor OCR du moteur sur une image réelle.
|
||||
"""
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
|
||||
|
||||
def test_ocr_engine_is_onnxtr():
    """The OCR model returned by the core must come from an onnxtr module."""
    # After the migration the OCR engine must be OnnxTR (ONNX Runtime, no torch).
    assert core._OCR_AVAILABLE, "moteur OCR indisponible"

    # The defining module of the model's class identifies the backing library.
    engine_module = type(core._get_ocr_model()).__module__
    assert "onnxtr" in engine_module.lower(), engine_module
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_ocr_reads_rendered_text():
    """Render four uppercase words on a white image and check the OCR reads them.

    The test passes when at least two of the four rendered words appear in
    the upper-cased text recognised on the first page.
    """
    canvas = Image.new("RGB", (1400, 300), "white")
    pen = ImageDraw.Draw(canvas)

    # Try the preferred fonts in order; fall back to Pillow's default font
    # when none of them can be loaded.
    for font_name in ("DejaVuSans-Bold.ttf", "DejaVuSans.ttf"):
        try:
            font = ImageFont.truetype(font_name, 64)
        except OSError:
            continue
        break
    else:
        font = ImageFont.load_default()

    words = ["BORDEAUX", "DUPONT", "MARTIN", "BAYONNE"]
    pen.text((40, 110), " ".join(words), fill="black", font=font)

    model = core._get_ocr_model()
    result = model([np.array(canvas)])

    # Flatten blocks -> lines -> words of the first page into one string.
    recognised = []
    for block in result.pages[0].blocks:
        for text_line in block.lines:
            for word in text_line.words:
                recognised.append(word.value)
    got = " ".join(recognised).upper()

    found = len([expected for expected in words if expected in got])
    assert found >= 2, f"OCR a lu: {got!r}"
|
||||
Reference in New Issue
Block a user