Files
anonymisation/anonymisation_cli_onefile.spec
Domi31tls 8d683bc6d8 feat(ocr): migrer l'OCR de docTR (PyTorch) vers OnnxTR (ONNX Runtime)
OnnxTR exécute les MÊMES modèles que docTR (db_resnet50 + crnn_vgg16_bn) sur
ONNX Runtime, sans PyTorch. Corrige le crash torch/oneDNN « could not create a
primitive » sur CPU contraint (VM 2 cœurs collaborateur : OCR scan impossible →
quarantaine). Qualité identique validée empiriquement (CER 0,10-0,23 % vs docTR,
2 validations indépendantes Claude+Qwen), OCR ~2-3× plus rapide CPU.

- core : import OnnxTR, _get_ocr_model(), _OCR_AVAILABLE, boucle OCR inchangée
  (API miroir) ; ONNXTR_CACHE_DIR pour le frozen ; bandeau de logs ENV au démarrage
  (OS, CPU+AVX, cœurs, RAM, versions, providers) pour retours terrain auto-suffisants.
- 3 .spec : embarquent les poids ONNX OnnxTR (fail-closed) + hiddenimports onnxtr.
- requirements : onnxtr[cpu] (python-doctr conservé transitoirement).
- inclut le correctif quarantaine-visible du runner (GO Qwen).

Tests : test_ocr_onnxtr.py (RED→GREEN), 95 unit passed, e2e scan client OK
(OCR 5/5, PDF produit, plus de crash). Retrait torch du frozen + rebuild Windows
= étapes suivantes (gates Dom).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 17:07:00 +02:00

183 lines
4.6 KiB
Python

import os
from pathlib import Path
from PyInstaller.utils.hooks import collect_all, copy_metadata
# Spec CLI frozen — EXE de PRODUCTION (anonymisation fichier unique sans GUI).
# Même moteur / mêmes datas que anonymisation_onefile.spec, mais :
# - entrypoint = scripts/anonymize_cli.py (CLI production, pas launcher.py)
# Contrat : Anonymisation-CLI.exe <fichier> <dossier_sortie>
# Modèle CamemBERT-bio ONNX OBLIGATOIRE (fail-closed, code 3 si absent).
# - console=True (CLI), pas de Splash
# - name = Anonymisation-CLI -> ne remplace pas dist/Anonymisation.exe
# (Le harnais perf D-19 reste scripts/anonymize_batch_cli.py, non buildé ici.)
block_cipher = None
project_dir = Path(globals().get("SPECPATH", os.getcwd())).resolve()
def _data_entry(relative_path: str, target_dir: str | None = None):
src = project_dir / relative_path
if not src.exists():
return None
return (str(src), target_dir or relative_path)
binaries = []
datas = []
for relative_path, target_dir in [
("config", "config"),
("data/bdpm", "data/bdpm"),
("data/finess", "data/finess"),
("data/insee", "data/insee"),
("models/camembert-bio-deid/onnx", "models/camembert-bio-deid/onnx"),
("detectors", "detectors"),
("scripts", "scripts"),
("assets", "assets"),
]:
entry = _data_entry(relative_path, target_dir)
if entry is not None:
datas.append(entry)
for relative_path in [
"data/stopwords_manuels.txt",
"data/villes_blacklist.txt",
"data/dpi_labels_blacklist.txt",
"data/companion_blacklist.txt",
]:
entry = _data_entry(relative_path, "data")
if entry is not None:
datas.append(entry)
onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr"))
required_onnxtr_weights = [
"db_resnet50-69ba0015.onnx",
"crnn_vgg16_bn-743599aa.onnx",
]
missing_onnxtr_weights = []
for filename in required_onnxtr_weights:
src = onnxtr_cache_dir / "models" / filename
if src.exists():
datas.append((str(src), "models/onnxtr/models"))
else:
missing_onnxtr_weights.append(str(src))
if missing_onnxtr_weights:
raise FileNotFoundError(
"Poids OCR OnnxTR manquants pour le build frozen : "
+ ", ".join(missing_onnxtr_weights)
+ ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller."
)
hiddenimports = [
"anonymizer_core_refactored_onnx",
"admin_rules",
"config_defaults",
"profile_defaults",
"gui_batch_paths",
"manual_masking",
"pdf_mask_designer",
"format_converter",
"ner_manager_onnx",
"camembert_ner_manager",
"eds_pseudo_manager",
"gliner_manager",
"vlm_manager",
"build_info",
"doctr",
"doctr.io",
"doctr.models",
"doctr.models.detection",
"doctr.models.recognition",
# OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch)
"onnxtr",
"onnxtr.io",
"onnxtr.models",
"onnxtr.models.detection",
"onnxtr.models.recognition",
"onnxtr.utils",
"onnxtr.utils.data",
"cv2",
"torchvision",
"edsnlp",
"edsnlp.pipes",
"edsnlp.pipes.ner",
"edsnlp.pipes.ner.pseudo",
"spacy",
"spacy.lang.fr",
"gliner",
"onnxruntime",
"transformers",
"tokenizers",
"torch",
"pdfplumber",
"fitz",
"PIL",
"yaml",
"loguru",
"regex",
"optimum",
"optimum.onnxruntime",
"optimum.pipelines",
"optimum.modeling_base",
"optimum.exporters.onnx",
]
def _collect_optional_package(package_name: str):
try:
package_datas, package_binaries, package_hiddenimports = collect_all(package_name)
datas.extend(package_datas)
binaries.extend(package_binaries)
hiddenimports.extend(package_hiddenimports)
try:
datas.extend(copy_metadata(package_name, recursive=True))
except Exception:
pass
except Exception:
pass
for _package_name in [
"edsnlp",
"spacy",
"thinc",
"blis",
"srsly",
"catalogue",
"confection",
"cymem",
"preshed",
"murmurhash",
"gliner",
"loguru",
]:
_collect_optional_package(_package_name)
a = Analysis(
[str(project_dir / "scripts" / "anonymize_cli.py")],
pathex=[str(project_dir)],
binaries=binaries,
datas=datas,
hiddenimports=hiddenimports,
cipher=block_cipher,
noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name="Anonymisation-CLI",
debug=False,
strip=False,
upx=False,
console=True,
)