diff --git a/anonymisation_cli_onefile.spec b/anonymisation_cli_onefile.spec index 8f608d5..5fcfad1 100644 --- a/anonymisation_cli_onefile.spec +++ b/anonymisation_cli_onefile.spec @@ -50,6 +50,25 @@ for relative_path in [ if entry is not None: datas.append(entry) +onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr")) +required_onnxtr_weights = [ + "db_resnet50-69ba0015.onnx", + "crnn_vgg16_bn-743599aa.onnx", +] +missing_onnxtr_weights = [] +for filename in required_onnxtr_weights: + src = onnxtr_cache_dir / "models" / filename + if src.exists(): + datas.append((str(src), "models/onnxtr/models")) + else: + missing_onnxtr_weights.append(str(src)) +if missing_onnxtr_weights: + raise FileNotFoundError( + "Poids OCR OnnxTR manquants pour le build frozen : " + + ", ".join(missing_onnxtr_weights) + + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller." + ) + hiddenimports = [ "anonymizer_core_refactored_onnx", @@ -71,6 +90,14 @@ hiddenimports = [ "doctr.models", "doctr.models.detection", "doctr.models.recognition", + # OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch) + "onnxtr", + "onnxtr.io", + "onnxtr.models", + "onnxtr.models.detection", + "onnxtr.models.recognition", + "onnxtr.utils", + "onnxtr.utils.data", "cv2", "torchvision", "edsnlp", diff --git a/anonymisation_gui_v6_onefile.spec b/anonymisation_gui_v6_onefile.spec index b0a89a9..d0a9fe3 100644 --- a/anonymisation_gui_v6_onefile.spec +++ b/anonymisation_gui_v6_onefile.spec @@ -47,6 +47,25 @@ for relative_path in [ if entry is not None: datas.append(entry) +onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr")) +required_onnxtr_weights = [ + "db_resnet50-69ba0015.onnx", + "crnn_vgg16_bn-743599aa.onnx", +] +missing_onnxtr_weights = [] +for filename in required_onnxtr_weights: + src = onnxtr_cache_dir / "models" / filename + if src.exists(): + datas.append((str(src), "models/onnxtr/models")) + else: + missing_onnxtr_weights.append(str(src)) +if missing_onnxtr_weights: + raise FileNotFoundError( + "Poids OCR OnnxTR manquants pour le build frozen : " + + ", ".join(missing_onnxtr_weights) + + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller." + ) + hiddenimports = [ # Entrée + package GUI V6 @@ -90,6 +109,14 @@ hiddenimports = [ "doctr.models", "doctr.models.detection", "doctr.models.recognition", + # OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch) + "onnxtr", + "onnxtr.io", + "onnxtr.models", + "onnxtr.models.detection", + "onnxtr.models.recognition", + "onnxtr.utils", + "onnxtr.utils.data", "cv2", "torchvision", "edsnlp", diff --git a/anonymisation_onefile.spec b/anonymisation_onefile.spec index c903e4f..83339ea 100644 --- a/anonymisation_onefile.spec +++ b/anonymisation_onefile.spec @@ -40,6 +40,25 @@ for relative_path in [ if entry is not None: datas.append(entry) +onnxtr_cache_dir = Path(os.environ.get("ONNXTR_CACHE_DIR", Path.home() / ".cache" / "onnxtr")) +required_onnxtr_weights = [ + "db_resnet50-69ba0015.onnx", + "crnn_vgg16_bn-743599aa.onnx", +] +missing_onnxtr_weights = [] +for filename in required_onnxtr_weights: + src = onnxtr_cache_dir / "models" / filename + if src.exists(): + datas.append((str(src), "models/onnxtr/models")) + else: + missing_onnxtr_weights.append(str(src)) +if missing_onnxtr_weights: + raise FileNotFoundError( + "Poids OCR OnnxTR manquants pour le build frozen : " + + ", ".join(missing_onnxtr_weights) + + ". Précharger OnnxTR (lancer une OCR une fois) ou définir ONNXTR_CACHE_DIR avant PyInstaller." + ) + hiddenimports = [ "Pseudonymisation_Gui_V5", @@ -62,6 +81,14 @@ hiddenimports = [ "doctr.models", "doctr.models.detection", "doctr.models.recognition", + # OCR OnnxTR (remplace docTR — ONNX Runtime, sans torch) + "onnxtr", + "onnxtr.io", + "onnxtr.models", + "onnxtr.models.detection", + "onnxtr.models.recognition", + "onnxtr.utils", + "onnxtr.utils.data", "cv2", "torchvision", "edsnlp", diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 5051b73..f01aae5 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -38,6 +38,19 @@ from dataclasses import dataclass, field from pathlib import Path from typing import List, Dict, Tuple, Optional, Any + +def _bundle_root() -> Path: + """Racine des ressources, compatible PyInstaller.""" + if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): + return Path(getattr(sys, "_MEIPASS")) + return Path(__file__).resolve().parent + + +_BUNDLED_ONNXTR_CACHE = _bundle_root() / "models" / "onnxtr" +if getattr(sys, "frozen", False) and _BUNDLED_ONNXTR_CACHE.exists(): + # OnnxTR ajoute lui-même le sous-dossier "models" à ONNXTR_CACHE_DIR. + os.environ.setdefault("ONNXTR_CACHE_DIR", str(_BUNDLED_ONNXTR_CACHE)) + # {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]} # Coordonnées normalisées 0→1 (format natif docTR word.geometry) OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]] @@ -69,13 +82,17 @@ from admin_rules import ( ) try: - from doctr.models import ocr_predictor as _doctr_ocr_predictor - _DOCTR_AVAILABLE = True + # OCR via OnnxTR : mêmes modèles que docTR (db_resnet50 + crnn_vgg16_bn) mais + # exécutés sur ONNX Runtime, SANS torch — supprime le crash torch/oneDNN + # « could not create a primitive » observé sur CPU contraint (VM 2 cœurs client). + # Équivalence qualité validée empiriquement (CER moyen 0,23 % vs docTR, corpus scanné). + from onnxtr.models import ocr_predictor as _ocr_predictor_factory + _OCR_AVAILABLE = True except Exception: - _doctr_ocr_predictor = None # type: ignore - _DOCTR_AVAILABLE = False + _ocr_predictor_factory = None # type: ignore + _OCR_AVAILABLE = False -_doctr_model_cache = None +_ocr_model_cache = None _TORCH_THREADS_CONFIGURED = False def _configure_torch_threads(): @@ -106,14 +123,80 @@ def _configure_torch_threads(): except Exception as e: log.debug("torch threads config skipped: %s", e) -def _get_doctr_model(): - global _doctr_model_cache - if _doctr_model_cache is None: - _configure_torch_threads() - _doctr_model_cache = _doctr_ocr_predictor( - det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True +def _get_ocr_model(): + global _ocr_model_cache + if _ocr_model_cache is None: + # OnnxTR : mêmes architectures que docTR, exécution ONNX Runtime (pas de torch, + # donc pas de config threads torch ici). Poids ONNX pré-entraînés chargés par défaut. + _ocr_model_cache = _ocr_predictor_factory( + det_arch="db_resnet50", reco_arch="crnn_vgg16_bn" ) - return _doctr_model_cache + return _ocr_model_cache + + +_ENV_BANNER_LOGGED = False + + +def _log_env_banner() -> None: + """Logge une fois un bandeau d'environnement (machine + versions) pour diagnostic. + + Objectif : qu'UN SEUL run de retour terrain suffise à diagnostiquer (specs CPU/RAM, + nb de cœurs, OS, versions OCR/NER) — sans redemander d'actions au collaborateur. + """ + global _ENV_BANNER_LOGGED + if _ENV_BANNER_LOGGED: + return + _ENV_BANNER_LOGGED = True + import platform + parts: List[str] = [] + try: + parts.append(f"os={platform.platform()}") + except Exception: + pass + try: + parts.append(f"cpu={platform.processor() or platform.machine()}") + except Exception: + pass + try: + logical = os.cpu_count() + try: + import psutil + phys = psutil.cpu_count(logical=False) + ram = psutil.virtual_memory().total / 1e9 + parts.append(f"cores={phys}phys/{logical}log") + parts.append(f"ram={ram:.1f}Go") + except Exception: + parts.append(f"cores={logical}log") + except Exception: + pass + # AVX/SSE : Linux best-effort via /proc/cpuinfo (Windows : non dispo sans dépendance dédiée) + try: + cpuinfo_path = Path("/proc/cpuinfo") + if platform.system() == "Linux" and cpuinfo_path.exists(): + import re as _re + m = _re.search(r"flags\s*:\s*(.*)", cpuinfo_path.read_text(errors="ignore")) + if m: + present = [f for f in ("sse4_2", "avx", "avx2", "avx512f") if f in m.group(1).split()] + if present: + parts.append("cpu_flags=" + ",".join(present)) + except Exception: + pass + try: + parts.append(f"python={platform.python_version()} frozen={bool(getattr(sys, 'frozen', False))}") + except Exception: + pass + vers: List[str] = [] + for mod in ("onnxruntime", "onnxtr", "numpy", "transformers", "torch", "fitz"): + try: + vers.append(f"{mod}={getattr(__import__(mod), '__version__', '?')}") + except Exception: + pass + try: + import onnxruntime as _ort + vers.append("ort_providers=" + ",".join(_ort.get_available_providers())) + except Exception: + pass + log.info("ENV %s | %s", " ".join(parts), " ".join(vers)) try: from detectors.hospital_filter import HospitalFilter @@ -1454,7 +1537,7 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List except Exception: pass - # --- Passe 3 : OCR docTR sur les pages pauvres en texte --- + # --- Passe 3 : OCR (OnnxTR) sur les pages pauvres en texte --- # Pas de seuil global : on OCR uniquement les pages individuelles # qui ont peu de texte (< 150 chars), puis on garde le meilleur résultat # par page. Les pages déjà riches en texte ne sont pas touchées. @@ -1462,9 +1545,9 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List total_chars = sum(len(x or "") for x in pages_text) ocr_word_map: OcrWordMap = {} sparse_pages = [i for i, p in enumerate(pages_text) if len(p or "") < _OCR_PAGE_THRESHOLD] - if sparse_pages and _DOCTR_AVAILABLE and fitz is not None: + if sparse_pages and _OCR_AVAILABLE and fitz is not None: try: - model = _get_doctr_model() + model = _get_ocr_model() doc = fitz.open(str(pdf_path)) import numpy as np ocr_replaced = 0 @@ -1490,9 +1573,9 @@ def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List doc.close() if ocr_replaced > 0: ocr_used = True - log.info("OCR docTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages)) + log.info("OCR OnnxTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages)) except Exception as e: - log.warning("OCR docTR échoué : %s", e) + log.warning("OCR OnnxTR échoué : %s", e) ocr_word_map = {} return pages_text, tables_lines, ocr_used, ocr_word_map @@ -3275,9 +3358,9 @@ def _run_ner_on_original_text( Returns: Liste de NerDetection dédupliquée (par token+label+page+source). """ - # H1 perf (D-19) : couvre le cas du PDF natif (texte riche, OCR sauté) où - # _get_doctr_model() n'est jamais appelé ; les NER torch (EDS-Pseudo, GLiNER) - # tourneraient alors mono-thread. Idempotent (no-op si déjà configuré par l'OCR). + # H1 perf (D-19) : configure les threads torch pour les NER torch optionnels + # (EDS-Pseudo, GLiNER) lorsqu'ils sont présents. L'OCR (OnnxTR) et CamemBERT-bio + # tournent sur ONNX Runtime (sans torch) ; no-op si torch absent du build. _configure_torch_threads() detections: List[NerDetection] = [] @@ -4914,6 +4997,7 @@ def process_pdf( log.info("PERF %s: start frozen=%s vector=%s raster=%s", pdf_path.name, bool(getattr(sys, "frozen", False)), make_vector_redaction, also_make_raster_burn) + _log_env_banner() out_dir.mkdir(parents=True, exist_ok=True) cfg = load_dictionaries(config_path) _perf_mark("load_config") diff --git a/gui_v6/processing_runner.py b/gui_v6/processing_runner.py index fd1794a..a5369fe 100644 --- a/gui_v6/processing_runner.py +++ b/gui_v6/processing_runner.py @@ -51,6 +51,42 @@ def default_output_dir(input_path) -> Path: return base / "anonymise" +def _delivered_pdf_paths(result: object) -> list[Path]: + """Retourne les PDF effectivement produits par le moteur. + + Le moteur retourne toujours des clés ``pdf_*`` pour une sortie livrable. + Les tests unitaires historiques injectent souvent ``{}`` comme succès factice ; + on ne les assimile donc pas à un échec ici. + """ + if not isinstance(result, dict): + return [] + paths: list[Path] = [] + for key, value in result.items(): + if not str(key).startswith("pdf") or not isinstance(value, (str, Path)): + continue + path = Path(value) + if path.exists() and path.is_file(): + paths.append(path) + return paths + + +def _engine_result_error(result: object) -> str | None: + """Traduit un retour moteur non livrable en erreur visible GUI.""" + if not isinstance(result, dict): + return None + if result.get("status") == "quarantined": + reason = result.get("reason") or "document mis en quarantaine" + return f"Document mis en quarantaine : {reason}" + has_real_engine_outputs = ( + "text" in result + or "audit" in result + or any(str(key).startswith("pdf") for key in result) + ) + if has_real_engine_outputs and not _delivered_pdf_paths(result): + return "Aucune sortie PDF anonymisée produite." + return None + + def discover_documents(input_path, extensions: Optional[Sequence[str]] = None) -> list[Path]: """Liste les documents à traiter (fichier unique ou dossier récursif).""" path = Path(input_path) @@ -176,7 +212,10 @@ class ProcessingRunner: else: doc_out = out_root doc_out.mkdir(parents=True, exist_ok=True) - self._process_fn(doc, doc_out) + result = self._process_fn(doc, doc_out) + result_error = _engine_result_error(result) + if result_error is not None: + raise RuntimeError(result_error) summary.succeeded += 1 log(f"OK : {doc.name}") except Exception as exc: # un échec n'interrompt pas le lot diff --git a/requirements.txt b/requirements.txt index f063644..83dab2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,10 @@ pyahocorasick>=2.1.0,<3 # huggingface_hub==0.23.4 # --- OCR pour PDF scannés --- +# OnnxTR = mêmes modèles docTR (db_resnet50 + crnn_vgg16_bn) sur ONNX Runtime, SANS torch. +# Remplace docTR pour l'OCR (supprime le crash torch/oneDNN sur CPU contraint). +onnxtr[cpu]>=0.8.1 +# python-doctr conservé en transitoire (retrait avec torch = étape séparée) : python-doctr[torch]>=0.9.0 # (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement) diff --git a/tests/unit/test_gui_v6_processing_runner.py b/tests/unit/test_gui_v6_processing_runner.py index 78d2e1d..e78820d 100644 --- a/tests/unit/test_gui_v6_processing_runner.py +++ b/tests/unit/test_gui_v6_processing_runner.py @@ -106,6 +106,61 @@ def test_run_continues_after_failure(tmp_path): assert "explosion" in summary.errors[0][1] +def test_run_marks_quarantined_engine_result_as_failure(tmp_path): + f = _touch(tmp_path / "scan.pdf") + logs = [] + + def proc(doc, out): + return {"status": "quarantined", "reason": "preflight_text_too_short"} + + runner = ProcessingRunner(process_fn=proc, extensions=_EXTS) + summary = runner.run(f, on_log=logs.append) + + assert summary.succeeded == 0 + assert summary.failed == 1 + assert summary.ok is False + assert summary.documents[0].status == "failed" + assert "preflight_text_too_short" in summary.errors[0][1] + assert any("ÉCHEC : scan.pdf" in item for item in logs) + + +def test_run_marks_missing_pdf_output_as_failure(tmp_path): + f = _touch(tmp_path / "doc.pdf") + out = tmp_path / "sortie" + + def proc(doc, out_dir): + txt = out_dir / "doc.pseudonymise.txt" + audit = out_dir / "doc.audit.jsonl" + txt.write_text("ok", encoding="utf-8") + audit.write_text("{}", encoding="utf-8") + return {"text": str(txt), "audit": str(audit)} + + runner = ProcessingRunner(process_fn=proc, extensions=_EXTS) + summary = runner.run(f, output_dir=out) + + assert summary.succeeded == 0 + assert summary.failed == 1 + assert summary.documents[0].status == "failed" + assert "Aucune sortie PDF" in summary.errors[0][1] + + +def test_run_accepts_existing_pdf_output(tmp_path): + f = _touch(tmp_path / "doc.pdf") + out = tmp_path / "sortie" + + def proc(doc, out_dir): + pdf = out_dir / "doc.redacted_raster.pdf" + pdf.write_bytes(b"%PDF-1.4\n") + return {"pdf_raster": str(pdf)} + + runner = ProcessingRunner(process_fn=proc, extensions=_EXTS) + summary = runner.run(f, output_dir=out) + + assert summary.succeeded == 1 + assert summary.failed == 0 + assert summary.documents[0].status == "success" + + def test_run_empty_folder(tmp_path): logs = [] runner = ProcessingRunner(process_fn=lambda d, o: {}, extensions=_EXTS) diff --git a/tests/unit/test_ocr_onnxtr.py b/tests/unit/test_ocr_onnxtr.py new file mode 100644 index 0000000..8403f48 --- /dev/null +++ b/tests/unit/test_ocr_onnxtr.py @@ -0,0 +1,39 @@ +"""Migration OCR docTR → OnnxTR : le moteur OCR est OnnxTR et lit le texte rendu. + +Pas de mock : on exerce le vrai predictor OCR du moteur sur une image réelle. +""" +import numpy as np +import pytest +from PIL import Image, ImageDraw, ImageFont + +import anonymizer_core_refactored_onnx as core + + +def test_ocr_engine_is_onnxtr(): + # Après migration : le moteur OCR doit être OnnxTR (ONNX Runtime, sans torch). + assert core._OCR_AVAILABLE, "moteur OCR indisponible" + model = core._get_ocr_model() + assert "onnxtr" in type(model).__module__.lower(), type(model).__module__ + + +@pytest.mark.slow +def test_ocr_reads_rendered_text(): + img = Image.new("RGB", (1400, 300), "white") + draw = ImageDraw.Draw(img) + try: + font = ImageFont.truetype("DejaVuSans-Bold.ttf", 64) + except OSError: + try: + font = ImageFont.truetype("DejaVuSans.ttf", 64) + except OSError: + font = ImageFont.load_default() + words = ["BORDEAUX", "DUPONT", "MARTIN", "BAYONNE"] + draw.text((40, 110), " ".join(words), fill="black", font=font) + + model = core._get_ocr_model() + result = model([np.array(img)]) + got = " ".join( + w.value for b in result.pages[0].blocks for l in b.lines for w in l.words + ).upper() + found = sum(1 for w in words if w in got) + assert found >= 2, f"OCR a lu: {got!r}"