diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 4d0acae..1b851b8 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -17,6 +17,7 @@ import json import logging import os import re +import shutil import sys from concurrent.futures import ProcessPoolExecutor @@ -4674,19 +4675,54 @@ def process_pdf( redact_pdf_vector(pdf_path, anon.audit, vec_path, ocr_word_map=ocr_word_map) outputs["pdf_vector"] = str(vec_path) except Exception as e: - # Q-1 D2 : ne plus avaler silencieusement. Le texte (.pseudonymise.txt) - # est déjà sorti avant ce bloc — donc on log + flag quarantaine PDF - # (severity=partial). Le fallback raster + copie texte arrivent en D3. + # Q-1 D2/D3: no longer swallow the error silently. The text (.pseudonymise.txt) + # has already been written before this block. log.warning("PDF vector redaction failed for %s: %s", pdf_path.name, e) - if quarantine_mgr is not None: - quarantine_mgr.flag( - doc_name=pdf_path.stem, - reason="pdf_redaction_failed", - detail=str(e), - severity="partial", - exc=e, + + # D3a: Decision B of the consolidated v2 — always attempt a raster fallback + raster_fallback_ok = False + raster_err: Optional[Exception] = None + try: + ras_fb_path = out_dir / f"{base}.redacted_raster.pdf" + redact_pdf_raster( + pdf_path, anon.audit, ras_fb_path, + ogc_label=ogc_label, ocr_word_map=ocr_word_map, ) - # Note : pas de raise — texte anonymisé déjà disponible, partial OK + outputs["pdf_raster"] = str(ras_fb_path) + raster_fallback_ok = True + log.info("PDF raster fallback OK for %s", pdf_path.name) + except Exception as e2: + raster_err = e2 + log.warning("PDF raster fallback also failed for %s: %s", pdf_path.name, e2) + + if quarantine_mgr is not None: + if raster_fallback_ok: + # Vector failed but raster OK: lower quality, flagged explicitly + quarantine_mgr.flag( + doc_name=pdf_path.stem, + reason="pdf_vector_fallback_to_raster", + detail=f"vector failed ({e}); raster fallback 
succeeded", + severity="partial", + exc=e, + ) + else: + quarantine_mgr.flag( + doc_name=pdf_path.stem, + reason="pdf_redaction_failed", + detail=f"vector failed ({e}); raster also failed ({raster_err})", + severity="partial", + exc=e, + ) + + # Decision A finalized: copy the text into quarantine so that folder is self-contained + # (the operator can review everything from a single folder) + try: + quarantine_mgr.quarantine_dir.mkdir(parents=True, exist_ok=True) + shutil.copy(txt_path, quarantine_mgr.quarantine_dir / txt_path.name) + except Exception as copy_err: + log.warning("Could not copy text to quarantine for %s: %s", + pdf_path.name, copy_err) + # Note: no raise — anonymized text is available (and copied if quarantine_mgr is set). NOTE(review): the also_make_raster_burn branch that follows passes the same {base}.redacted_raster.pdf path to redact_pdf_raster as the fallback above — confirm that re-running it after a fallback attempt is intended. if also_make_raster_burn and fitz is not None: ras_path = out_dir / f"{base}.redacted_raster.pdf" redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map)