feat(ocr): migrer l'OCR de docTR (PyTorch) vers OnnxTR (ONNX Runtime)
OnnxTR exécute les MÊMES modèles que docTR (db_resnet50 + crnn_vgg16_bn) sur ONNX Runtime, sans PyTorch. Corrige le crash torch/oneDNN « could not create a primitive » sur CPU contraint (VM 2 cœurs collaborateur : OCR scan impossible → quarantaine). Qualité identique validée empiriquement (CER 0,10-0,23 % vs docTR, 2 validations indépendantes Claude+Qwen), OCR ~2-3× plus rapide CPU. - core : import OnnxTR, _get_ocr_model(), _OCR_AVAILABLE, boucle OCR inchangée (API miroir) ; ONNXTR_CACHE_DIR pour le frozen ; bandeau de logs ENV au démarrage (OS, CPU+AVX, cœurs, RAM, versions, providers) pour retours terrain auto-suffisants. - 3 .spec : embarquent les poids ONNX OnnxTR (fail-closed) + hiddenimports onnxtr. - requirements : onnxtr[cpu] (python-doctr conservé transitoirement). - inclut le correctif quarantaine-visible du runner (GO Qwen). Tests : test_ocr_onnxtr.py (RED→GREEN), 95 unit passed, e2e scan client OK (OCR 5/5, PDF produit, plus de crash). Retrait torch du frozen + rebuild Windows = étapes suivantes (gates Dom). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -50,6 +50,25 @@ for relative_path in [
|
|||||||
if entry is not None:
|
if entry is not None:
|
||||||
datas.append(entry)
|
datas.append(entry)
|
||||||
|
|
||||||
|
# Bundle the OnnxTR ONNX weights (db_resnet50 + crnn_vgg16_bn) into the frozen build.
# The cache location follows ONNXTR_CACHE_DIR when set, else ~/.cache/onnxtr.
onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr"))
required_onnxtr_weights = [
    "db_resnet50-69ba0015.onnx",
    "crnn_vgg16_bn-743599aa.onnx",
]
missing_onnxtr_weights = []
for filename in required_onnxtr_weights:
    src = onnxtr_cache_dir / "models" / filename
    # is_file() rather than exists(): a directory that merely carries the weight's
    # name must be reported as missing, otherwise the fail-closed check is bypassed.
    if src.is_file():
        datas.append((str(src), "models/onnxtr/models"))
    else:
        missing_onnxtr_weights.append(str(src))
# Fail closed: refuse to build a frozen app that would ship without its OCR weights.
if missing_onnxtr_weights:
    raise FileNotFoundError(
        "Poids OCR OnnxTR manquants pour le build frozen : "
        + ", ".join(missing_onnxtr_weights)
        + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller."
    )
|
||||||
|
|
||||||
|
|
||||||
hiddenimports = [
|
hiddenimports = [
|
||||||
"anonymizer_core_refactored_onnx",
|
"anonymizer_core_refactored_onnx",
|
||||||
@@ -71,6 +90,14 @@ hiddenimports = [
|
|||||||
"doctr.models",
|
"doctr.models",
|
||||||
"doctr.models.detection",
|
"doctr.models.detection",
|
||||||
"doctr.models.recognition",
|
"doctr.models.recognition",
|
||||||
|
# OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch)
|
||||||
|
"onnxtr",
|
||||||
|
"onnxtr.io",
|
||||||
|
"onnxtr.models",
|
||||||
|
"onnxtr.models.detection",
|
||||||
|
"onnxtr.models.recognition",
|
||||||
|
"onnxtr.utils",
|
||||||
|
"onnxtr.utils.data",
|
||||||
"cv2",
|
"cv2",
|
||||||
"torchvision",
|
"torchvision",
|
||||||
"edsnlp",
|
"edsnlp",
|
||||||
|
|||||||
@@ -47,6 +47,25 @@ for relative_path in [
|
|||||||
if entry is not None:
|
if entry is not None:
|
||||||
datas.append(entry)
|
datas.append(entry)
|
||||||
|
|
||||||
|
# Bundle the OnnxTR ONNX weights (db_resnet50 + crnn_vgg16_bn) into the frozen build.
# The cache location follows ONNXTR_CACHE_DIR when set, else ~/.cache/onnxtr.
onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr"))
required_onnxtr_weights = [
    "db_resnet50-69ba0015.onnx",
    "crnn_vgg16_bn-743599aa.onnx",
]
missing_onnxtr_weights = []
for filename in required_onnxtr_weights:
    src = onnxtr_cache_dir / "models" / filename
    # is_file() rather than exists(): a directory that merely carries the weight's
    # name must be reported as missing, otherwise the fail-closed check is bypassed.
    if src.is_file():
        datas.append((str(src), "models/onnxtr/models"))
    else:
        missing_onnxtr_weights.append(str(src))
# Fail closed: refuse to build a frozen app that would ship without its OCR weights.
if missing_onnxtr_weights:
    raise FileNotFoundError(
        "Poids OCR OnnxTR manquants pour le build frozen : "
        + ", ".join(missing_onnxtr_weights)
        + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller."
    )
|
||||||
|
|
||||||
|
|
||||||
hiddenimports = [
|
hiddenimports = [
|
||||||
# Entrée + package GUI V6
|
# Entrée + package GUI V6
|
||||||
@@ -90,6 +109,14 @@ hiddenimports = [
|
|||||||
"doctr.models",
|
"doctr.models",
|
||||||
"doctr.models.detection",
|
"doctr.models.detection",
|
||||||
"doctr.models.recognition",
|
"doctr.models.recognition",
|
||||||
|
# OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch)
|
||||||
|
"onnxtr",
|
||||||
|
"onnxtr.io",
|
||||||
|
"onnxtr.models",
|
||||||
|
"onnxtr.models.detection",
|
||||||
|
"onnxtr.models.recognition",
|
||||||
|
"onnxtr.utils",
|
||||||
|
"onnxtr.utils.data",
|
||||||
"cv2",
|
"cv2",
|
||||||
"torchvision",
|
"torchvision",
|
||||||
"edsnlp",
|
"edsnlp",
|
||||||
|
|||||||
@@ -40,6 +40,25 @@ for relative_path in [
|
|||||||
if entry is not None:
|
if entry is not None:
|
||||||
datas.append(entry)
|
datas.append(entry)
|
||||||
|
|
||||||
|
# Bundle the OnnxTR ONNX weights (db_resnet50 + crnn_vgg16_bn) into the frozen build.
# The cache location follows ONNXTR_CACHE_DIR when set, else ~/.cache/onnxtr.
onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr"))
required_onnxtr_weights = [
    "db_resnet50-69ba0015.onnx",
    "crnn_vgg16_bn-743599aa.onnx",
]
missing_onnxtr_weights = []
for filename in required_onnxtr_weights:
    src = onnxtr_cache_dir / "models" / filename
    # is_file() rather than exists(): a directory that merely carries the weight's
    # name must be reported as missing, otherwise the fail-closed check is bypassed.
    if src.is_file():
        datas.append((str(src), "models/onnxtr/models"))
    else:
        missing_onnxtr_weights.append(str(src))
# Fail closed: refuse to build a frozen app that would ship without its OCR weights.
if missing_onnxtr_weights:
    raise FileNotFoundError(
        "Poids OCR OnnxTR manquants pour le build frozen : "
        + ", ".join(missing_onnxtr_weights)
        + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller."
    )
|
||||||
|
|
||||||
|
|
||||||
hiddenimports = [
|
hiddenimports = [
|
||||||
"Pseudonymisation_Gui_V5",
|
"Pseudonymisation_Gui_V5",
|
||||||
@@ -62,6 +81,14 @@ hiddenimports = [
|
|||||||
"doctr.models",
|
"doctr.models",
|
||||||
"doctr.models.detection",
|
"doctr.models.detection",
|
||||||
"doctr.models.recognition",
|
"doctr.models.recognition",
|
||||||
|
# OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch)
|
||||||
|
"onnxtr",
|
||||||
|
"onnxtr.io",
|
||||||
|
"onnxtr.models",
|
||||||
|
"onnxtr.models.detection",
|
||||||
|
"onnxtr.models.recognition",
|
||||||
|
"onnxtr.utils",
|
||||||
|
"onnxtr.utils.data",
|
||||||
"cv2",
|
"cv2",
|
||||||
"torchvision",
|
"torchvision",
|
||||||
"edsnlp",
|
"edsnlp",
|
||||||
|
|||||||
@@ -38,6 +38,19 @@ from dataclasses import dataclass, field
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Tuple, Optional, Any
|
from typing import List, Dict, Tuple, Optional, Any
|
||||||
|
|
||||||
|
|
||||||
|
def _bundle_root() -> Path:
    """Return the directory that holds bundled resources.

    In a PyInstaller build (``sys.frozen`` truthy and ``sys._MEIPASS`` present)
    this is the ``_MEIPASS`` directory; otherwise it is the directory that
    contains this source file.
    """
    running_frozen = bool(getattr(sys, "frozen", False))
    if not (running_frozen and hasattr(sys, "_MEIPASS")):
        return Path(__file__).resolve().parent
    return Path(sys._MEIPASS)
|
||||||
|
|
||||||
|
|
||||||
|
# Location of the OnnxTR cache shipped inside the bundle (<root>/models/onnxtr).
_BUNDLED_ONNXTR_CACHE = _bundle_root().joinpath("models", "onnxtr")
if getattr(sys, "frozen", False) and _BUNDLED_ONNXTR_CACHE.exists():
    # Per the original author's note, OnnxTR appends the "models" sub-folder to
    # ONNXTR_CACHE_DIR by itself, so the variable points at the parent directory.
    # An ONNXTR_CACHE_DIR already present in the environment is left untouched.
    if "ONNXTR_CACHE_DIR" not in os.environ:
        os.environ["ONNXTR_CACHE_DIR"] = str(_BUNDLED_ONNXTR_CACHE)
|
||||||
|
|
||||||
# {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]}
|
# {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]}
|
||||||
# Coordonnées normalisées 0→1 (format natif docTR word.geometry)
|
# Coordonnées normalisées 0→1 (format natif docTR word.geometry)
|
||||||
OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]]
|
OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]]
|
||||||
@@ -69,13 +82,17 @@ from admin_rules import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
# OCR via OnnxTR : mêmes modèles que docTR (db_resnet50 + crnn_vgg16_bn) mais
|
||||||
_DOCTR_AVAILABLE = True
|
# exécutés sur ONNX Runtime, SANS torch — supprime le crash torch/oneDNN
|
||||||
|
# « could not create a primitive » observé sur CPU contraint (VM 2 cœurs client).
|
||||||
|
# Équivalence qualité validée empiriquement (CER moyen 0,23 % vs docTR, corpus scanné).
|
||||||
|
from onnxtr.models import ocr_predictor as _ocr_predictor_factory
|
||||||
|
_OCR_AVAILABLE = True
|
||||||
except Exception:
|
except Exception:
|
||||||
_doctr_ocr_predictor = None # type: ignore
|
_ocr_predictor_factory = None # type: ignore
|
||||||
_DOCTR_AVAILABLE = False
|
_OCR_AVAILABLE = False
|
||||||
|
|
||||||
_doctr_model_cache = None
|
_ocr_model_cache = None
|
||||||
_TORCH_THREADS_CONFIGURED = False
|
_TORCH_THREADS_CONFIGURED = False
|
||||||
|
|
||||||
def _configure_torch_threads():
|
def _configure_torch_threads():
|
||||||
@@ -106,14 +123,80 @@ def _configure_torch_threads():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug("torch threads config skipped: %s", e)
|
log.debug("torch threads config skipped: %s", e)
|
||||||
|
|
||||||
def _get_doctr_model():
|
def _get_ocr_model():
|
||||||
global _doctr_model_cache
|
global _ocr_model_cache
|
||||||
if _doctr_model_cache is None:
|
if _ocr_model_cache is None:
|
||||||
_configure_torch_threads()
|
# OnnxTR : mêmes architectures que docTR, exécution ONNX Runtime (pas de torch,
|
||||||
_doctr_model_cache = _doctr_ocr_predictor(
|
# donc pas de config threads torch ici). Poids ONNX pré-entraînés chargés par défaut.
|
||||||
det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True
|
_ocr_model_cache = _ocr_predictor_factory(
|
||||||
|
det_arch="db_resnet50", reco_arch="crnn_vgg16_bn"
|
||||||
)
|
)
|
||||||
return _doctr_model_cache
|
return _ocr_model_cache
|
||||||
|
|
||||||
|
|
||||||
|
_ENV_BANNER_LOGGED = False
|
||||||
|
|
||||||
|
|
||||||
|
def _log_env_banner() -> None:
    """Log, once per process, a single-line environment banner for diagnostics.

    The banner gathers OS, CPU name, core counts, RAM (when ``psutil`` is
    importable), selected SIMD flags (Linux only, read from ``/proc/cpuinfo``),
    the Python version and frozen state, the versions of a fixed list of
    modules, and the available ONNX Runtime providers. Every probe is
    best-effort: a failing probe is skipped and never raises.
    """
    global _ENV_BANNER_LOGGED
    if _ENV_BANNER_LOGGED:
        return
    # Raised before any probing, so the banner is attempted only once.
    _ENV_BANNER_LOGGED = True
    import platform

    machine_bits: List[str] = []

    try:
        machine_bits.append(f"os={platform.platform()}")
    except Exception:
        pass

    try:
        machine_bits.append(f"cpu={platform.processor() or platform.machine()}")
    except Exception:
        pass

    try:
        logical_cores = os.cpu_count()
        try:
            import psutil
            physical_cores = psutil.cpu_count(logical=False)
            ram_gb = psutil.virtual_memory().total / 1e9
            # Both values are computed before anything is appended, so a psutil
            # failure leaves only the logical-core fallback below.
            machine_bits.extend(
                [f"cores={physical_cores}phys/{logical_cores}log", f"ram={ram_gb:.1f}Go"]
            )
        except Exception:
            machine_bits.append(f"cores={logical_cores}log")
    except Exception:
        pass

    # AVX/SSE: best-effort on Linux via /proc/cpuinfo (not available on Windows
    # without a dedicated dependency).
    try:
        cpuinfo = Path("/proc/cpuinfo")
        if platform.system() == "Linux" and cpuinfo.exists():
            import re as _re
            flags_match = _re.search(r"flags\s*:\s*(.*)", cpuinfo.read_text(errors="ignore"))
            if flags_match:
                advertised = flags_match.group(1).split()
                simd = [
                    flag
                    for flag in ("sse4_2", "avx", "avx2", "avx512f")
                    if flag in advertised
                ]
                if simd:
                    machine_bits.append("cpu_flags=" + ",".join(simd))
    except Exception:
        pass

    try:
        is_frozen = bool(getattr(sys, 'frozen', False))
        machine_bits.append(f"python={platform.python_version()} frozen={is_frozen}")
    except Exception:
        pass

    version_bits: List[str] = []
    for module_name in ("onnxruntime", "onnxtr", "numpy", "transformers", "torch", "fitz"):
        # Modules that cannot be imported are simply left out of the banner.
        try:
            module_version = getattr(__import__(module_name), '__version__', '?')
            version_bits.append(f"{module_name}={module_version}")
        except Exception:
            pass

    try:
        import onnxruntime as _ort
        version_bits.append("ort_providers=" + ",".join(_ort.get_available_providers()))
    except Exception:
        pass

    log.info("ENV %s | %s", " ".join(machine_bits), " ".join(version_bits))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from detectors.hospital_filter import HospitalFilter
|
from detectors.hospital_filter import HospitalFilter
|
||||||
@@ -1454,7 +1537,7 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# --- Passe 3 : OCR docTR sur les pages pauvres en texte ---
|
# --- Passe 3 : OCR (OnnxTR) sur les pages pauvres en texte ---
|
||||||
# Pas de seuil global : on OCR uniquement les pages individuelles
|
# Pas de seuil global : on OCR uniquement les pages individuelles
|
||||||
# qui ont peu de texte (< 150 chars), puis on garde le meilleur résultat
|
# qui ont peu de texte (< 150 chars), puis on garde le meilleur résultat
|
||||||
# par page. Les pages déjà riches en texte ne sont pas touchées.
|
# par page. Les pages déjà riches en texte ne sont pas touchées.
|
||||||
@@ -1462,9 +1545,9 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List
|
|||||||
total_chars = sum(len(x or "") for x in pages_text)
|
total_chars = sum(len(x or "") for x in pages_text)
|
||||||
ocr_word_map: OcrWordMap = {}
|
ocr_word_map: OcrWordMap = {}
|
||||||
sparse_pages = [i for i, p in enumerate(pages_text) if len(p or "") < _OCR_PAGE_THRESHOLD]
|
sparse_pages = [i for i, p in enumerate(pages_text) if len(p or "") < _OCR_PAGE_THRESHOLD]
|
||||||
if sparse_pages and _DOCTR_AVAILABLE and fitz is not None:
|
if sparse_pages and _OCR_AVAILABLE and fitz is not None:
|
||||||
try:
|
try:
|
||||||
model = _get_doctr_model()
|
model = _get_ocr_model()
|
||||||
doc = fitz.open(str(pdf_path))
|
doc = fitz.open(str(pdf_path))
|
||||||
import numpy as np
|
import numpy as np
|
||||||
ocr_replaced = 0
|
ocr_replaced = 0
|
||||||
@@ -1490,9 +1573,9 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List
|
|||||||
doc.close()
|
doc.close()
|
||||||
if ocr_replaced > 0:
|
if ocr_replaced > 0:
|
||||||
ocr_used = True
|
ocr_used = True
|
||||||
log.info("OCR docTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages))
|
log.info("OCR OnnxTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning("OCR docTR échoué : %s", e)
|
log.warning("OCR OnnxTR échoué : %s", e)
|
||||||
ocr_word_map = {}
|
ocr_word_map = {}
|
||||||
return pages_text, tables_lines, ocr_used, ocr_word_map
|
return pages_text, tables_lines, ocr_used, ocr_word_map
|
||||||
|
|
||||||
@@ -3275,9 +3358,9 @@ def _run_ner_on_original_text(
|
|||||||
Returns:
|
Returns:
|
||||||
Liste de NerDetection dédupliquée (par token+label+page+source).
|
Liste de NerDetection dédupliquée (par token+label+page+source).
|
||||||
"""
|
"""
|
||||||
# H1 perf (D-19) : couvre le cas du PDF natif (texte riche, OCR sauté) où
|
# H1 perf (D-19) : configure les threads torch pour les NER torch optionnels
|
||||||
# _get_doctr_model() n'est jamais appelé ; les NER torch (EDS-Pseudo, GLiNER)
|
# (EDS-Pseudo, GLiNER) lorsqu'ils sont présents. L'OCR (OnnxTR) et CamemBERT-bio
|
||||||
# tourneraient alors mono-thread. Idempotent (no-op si déjà configuré par l'OCR).
|
# tournent sur ONNX Runtime (sans torch) ; no-op si torch absent du build.
|
||||||
_configure_torch_threads()
|
_configure_torch_threads()
|
||||||
|
|
||||||
detections: List[NerDetection] = []
|
detections: List[NerDetection] = []
|
||||||
@@ -4914,6 +4997,7 @@ def process_pdf(
|
|||||||
|
|
||||||
log.info("PERF %s: start frozen=%s vector=%s raster=%s",
|
log.info("PERF %s: start frozen=%s vector=%s raster=%s",
|
||||||
pdf_path.name, bool(getattr(sys, "frozen", False)), make_vector_redaction, also_make_raster_burn)
|
pdf_path.name, bool(getattr(sys, "frozen", False)), make_vector_redaction, also_make_raster_burn)
|
||||||
|
_log_env_banner()
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
cfg = load_dictionaries(config_path)
|
cfg = load_dictionaries(config_path)
|
||||||
_perf_mark("load_config")
|
_perf_mark("load_config")
|
||||||
|
|||||||
@@ -51,6 +51,42 @@ def default_output_dir(input_path) -> Path:
|
|||||||
return base / "anonymise"
|
return base / "anonymise"
|
||||||
|
|
||||||
|
|
||||||
|
def _delivered_pdf_paths(result: object) -> list[Path]:
    """Return the PDF files the engine actually produced.

    Only entries of ``result`` whose key (as a string) starts with ``"pdf"``
    and whose value is a ``str`` or ``Path`` are considered; of those, only
    paths that exist as regular files are returned, in dict order. A
    non-dict ``result`` yields an empty list.

    Per the original note, historical unit tests often inject ``{}`` as a fake
    success, so an empty result is not treated as a failure here.
    """
    if not isinstance(result, dict):
        return []
    candidates = (
        Path(value)
        for key, value in result.items()
        if str(key).startswith("pdf") and isinstance(value, (str, Path))
    )
    # is_file() is False for anything that does not exist, so it covers both checks.
    return [candidate for candidate in candidates if candidate.is_file()]
|
||||||
|
|
||||||
|
|
||||||
|
def _engine_result_error(result: object) -> str | None:
    """Translate a non-deliverable engine result into a user-visible error message.

    Returns ``None`` when ``result`` is not a dict or when nothing is wrong.
    A result whose ``"status"`` is ``"quarantined"`` yields a quarantine message
    that carries its ``"reason"`` (or a generic fallback). A result that looks
    like real engine output (a ``"text"`` or ``"audit"`` key, or any key
    starting with ``"pdf"``) but delivered no PDF file yields a
    "no PDF produced" message.
    """
    if not isinstance(result, dict):
        return None

    if result.get("status") == "quarantined":
        detail = result.get("reason") or "document mis en quarantaine"
        return f"Document mis en quarantaine : {detail}"

    looks_like_engine_output = (
        any(marker in result for marker in ("text", "audit"))
        or any(str(key).startswith("pdf") for key in result)
    )
    # Results with none of these keys (e.g. the {} used by older tests) are accepted.
    if not looks_like_engine_output:
        return None
    if _delivered_pdf_paths(result):
        return None
    return "Aucune sortie PDF anonymisée produite."
|
||||||
|
|
||||||
|
|
||||||
def discover_documents(input_path, extensions: Optional[Sequence[str]] = None) -> list[Path]:
|
def discover_documents(input_path, extensions: Optional[Sequence[str]] = None) -> list[Path]:
|
||||||
"""Liste les documents à traiter (fichier unique ou dossier récursif)."""
|
"""Liste les documents à traiter (fichier unique ou dossier récursif)."""
|
||||||
path = Path(input_path)
|
path = Path(input_path)
|
||||||
@@ -176,7 +212,10 @@ class ProcessingRunner:
|
|||||||
else:
|
else:
|
||||||
doc_out = out_root
|
doc_out = out_root
|
||||||
doc_out.mkdir(parents=True, exist_ok=True)
|
doc_out.mkdir(parents=True, exist_ok=True)
|
||||||
self._process_fn(doc, doc_out)
|
result = self._process_fn(doc, doc_out)
|
||||||
|
result_error = _engine_result_error(result)
|
||||||
|
if result_error is not None:
|
||||||
|
raise RuntimeError(result_error)
|
||||||
summary.succeeded += 1
|
summary.succeeded += 1
|
||||||
log(f"OK : {doc.name}")
|
log(f"OK : {doc.name}")
|
||||||
except Exception as exc: # un échec n'interrompt pas le lot
|
except Exception as exc: # un échec n'interrompt pas le lot
|
||||||
|
|||||||
@@ -19,6 +19,10 @@ pyahocorasick>=2.1.0,<3
|
|||||||
# huggingface_hub==0.23.4
|
# huggingface_hub==0.23.4
|
||||||
|
|
||||||
# --- OCR pour PDF scannés ---
|
# --- OCR pour PDF scannés ---
|
||||||
|
# OnnxTR = mêmes modèles docTR (db_resnet50 + crnn_vgg16_bn) sur ONNX Runtime, SANS torch.
|
||||||
|
# Remplace docTR pour l'OCR (supprime le crash torch/oneDNN sur CPU contraint).
|
||||||
|
onnxtr[cpu]>=0.8.1
|
||||||
|
# python-doctr conservé en transitoire (retrait avec torch = étape séparée) :
|
||||||
python-doctr[torch]>=0.9.0
|
python-doctr[torch]>=0.9.0
|
||||||
|
|
||||||
# (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement)
|
# (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement)
|
||||||
|
|||||||
@@ -106,6 +106,61 @@ def test_run_continues_after_failure(tmp_path):
|
|||||||
assert "explosion" in summary.errors[0][1]
|
assert "explosion" in summary.errors[0][1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_marks_quarantined_engine_result_as_failure(tmp_path):
    """A quarantined engine result is counted and logged as a failed document."""
    source = _touch(tmp_path / "scan.pdf")
    messages = []

    def quarantining_engine(doc, out):
        return {"status": "quarantined", "reason": "preflight_text_too_short"}

    runner = ProcessingRunner(process_fn=quarantining_engine, extensions=_EXTS)
    summary = runner.run(source, on_log=messages.append)

    assert summary.succeeded == 0
    assert summary.failed == 1
    assert summary.ok is False
    assert summary.documents[0].status == "failed"
    # The quarantine reason must surface in the recorded error text.
    assert "preflight_text_too_short" in summary.errors[0][1]
    assert any("ÉCHEC : scan.pdf" in message for message in messages)
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_marks_missing_pdf_output_as_failure(tmp_path):
    """Text and audit outputs without any PDF are reported as a failure."""
    source = _touch(tmp_path / "doc.pdf")
    destination = tmp_path / "sortie"

    def engine_without_pdf(doc, out_dir):
        # Writes the side files but deliberately returns no pdf_* key.
        text_file = out_dir / "doc.pseudonymise.txt"
        audit_file = out_dir / "doc.audit.jsonl"
        text_file.write_text("ok", encoding="utf-8")
        audit_file.write_text("{}", encoding="utf-8")
        return {"text": str(text_file), "audit": str(audit_file)}

    runner = ProcessingRunner(process_fn=engine_without_pdf, extensions=_EXTS)
    summary = runner.run(source, output_dir=destination)

    assert summary.succeeded == 0
    assert summary.failed == 1
    assert summary.documents[0].status == "failed"
    assert "Aucune sortie PDF" in summary.errors[0][1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_accepts_existing_pdf_output(tmp_path):
    """A result whose pdf_* entry points at an existing file counts as a success."""
    source = _touch(tmp_path / "doc.pdf")
    destination = tmp_path / "sortie"

    def engine_with_pdf(doc, out_dir):
        produced = out_dir / "doc.redacted_raster.pdf"
        produced.write_bytes(b"%PDF-1.4\n")
        return {"pdf_raster": str(produced)}

    runner = ProcessingRunner(process_fn=engine_with_pdf, extensions=_EXTS)
    summary = runner.run(source, output_dir=destination)

    assert summary.succeeded == 1
    assert summary.failed == 0
    assert summary.documents[0].status == "success"
|
||||||
|
|
||||||
|
|
||||||
def test_run_empty_folder(tmp_path):
|
def test_run_empty_folder(tmp_path):
|
||||||
logs = []
|
logs = []
|
||||||
runner = ProcessingRunner(process_fn=lambda d, o: {}, extensions=_EXTS)
|
runner = ProcessingRunner(process_fn=lambda d, o: {}, extensions=_EXTS)
|
||||||
|
|||||||
39
tests/unit/test_ocr_onnxtr.py
Normal file
39
tests/unit/test_ocr_onnxtr.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""Migration OCR docTR → OnnxTR : le moteur OCR est OnnxTR et lit le texte rendu.
|
||||||
|
|
||||||
|
Pas de mock : on exerce le vrai predictor OCR du moteur sur une image réelle.
|
||||||
|
"""
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
|
||||||
|
|
||||||
|
def test_ocr_engine_is_onnxtr():
    """After the migration, the engine's OCR predictor must come from OnnxTR."""
    assert core._OCR_AVAILABLE, "moteur OCR indisponible"
    predictor_module = type(core._get_ocr_model()).__module__
    assert "onnxtr" in predictor_module.lower(), predictor_module
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
def test_ocr_reads_rendered_text():
    """The real OCR predictor reads at least two of four words drawn on an image."""
    canvas = Image.new("RGB", (1400, 300), "white")
    pen = ImageDraw.Draw(canvas)

    # Prefer a DejaVu TrueType face; fall back to PIL's default font if none loads.
    font = None
    for face in ("DejaVuSans-Bold.ttf", "DejaVuSans.ttf"):
        try:
            font = ImageFont.truetype(face, 64)
        except OSError:
            continue
        break
    if font is None:
        font = ImageFont.load_default()

    words = ["BORDEAUX", "DUPONT", "MARTIN", "BAYONNE"]
    pen.text((40, 110), " ".join(words), fill="black", font=font)

    predictor = core._get_ocr_model()
    result = predictor([np.array(canvas)])
    recognized = [
        word.value
        for block in result.pages[0].blocks
        for line in block.lines
        for word in line.words
    ]
    got = " ".join(recognized).upper()

    found = sum(1 for expected in words if expected in got)
    assert found >= 2, f"OCR a lu: {got!r}"
|
||||||
Reference in New Issue
Block a user