feat: support multi-formats — DOCX, images, ODT, RTF, TXT, HTML

Nouveau module format_converter.py : conversion automatique vers PDF avant anonymisation.

Formats supportés :
- PDF (passthrough)
- DOCX (python-docx → texte → PDF)
- ODT (odfpy → texte → PDF)
- RTF (striprtf → texte → PDF)
- TXT (texte brut → PDF via PyMuPDF)
- HTML (BeautifulSoup → texte → PDF)
- JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval)

Nouvelle fonction process_document() : wrapper qui gère la conversion puis appelle process_pdf().

GUI mise à jour pour chercher tous les formats supportés (plus seulement *.pdf).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 09:25:26 +01:00
parent 3992b43925
commit 437877e1c8
3 changed files with 342 additions and 9 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -4119,3 +4119,59 @@ if __name__ == "__main__":
        ner_thresholds=NerThresholds() if NerThresholds else None,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))
+
+
+# ---------------------------------------------------------------------------
+# process_document : wrapper multi-formats (PDF, DOCX, images, etc.)
+# ---------------------------------------------------------------------------
+
+def process_document(
+    doc_path: Path,
+    out_dir: Path,
+    **kwargs,
+) -> Dict[str, str]:
+    """Anonymize a document in any supported format.
+
+    The source is converted to PDF when needed (via
+    ``format_converter.convert_to_pdf``) and then handed to ``process_pdf()``.
+    When a temporary PDF was created, output files whose name carries the
+    ``<stem>.tmp_convert`` marker are renamed after the original document,
+    and the temporary PDF is deleted afterwards, whether processing
+    succeeded or not.
+
+    Args:
+        doc_path: Path of the source document; its lower-cased suffix must
+            be in ``format_converter.SUPPORTED_EXTENSIONS``.
+        out_dir: Output directory, forwarded unchanged to ``process_pdf()``.
+        **kwargs: Extra keyword arguments forwarded to ``process_pdf()``.
+
+    Returns:
+        The mapping returned by ``process_pdf()`` (output key -> file path),
+        with the values updated for any file that was renamed.
+
+    Raises:
+        ValueError: If the document suffix is not a supported extension.
+    """
+    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
+
+    # Accept plain string paths as well as Path objects.
+    doc_path = Path(doc_path)
+
+    suffix = doc_path.suffix.lower()
+    if suffix not in SUPPORTED_EXTENSIONS:
+        raise ValueError(
+            f"Format '{suffix}' non supporté. "
+            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
+
+    pdf_path, is_temp = convert_to_pdf(doc_path)
+    try:
+        outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)
+        if not is_temp:
+            return outputs
+
+        # Outputs are named after the temporary PDF; rename them so they
+        # carry the original document's stem instead.
+        tmp_marker = doc_path.stem + ".tmp_convert"
+        renamed = {}
+        for key, path_str in outputs.items():
+            p = Path(path_str)
+            if tmp_marker in p.name and p.exists():
+                new_path = p.with_name(p.name.replace(tmp_marker, doc_path.stem))
+                # Path.replace overwrites an existing target on every
+                # platform; Path.rename raises FileExistsError on Windows
+                # when the document has already been processed once.
+                p.replace(new_path)
+                renamed[key] = str(new_path)
+            else:
+                renamed[key] = path_str
+        return renamed
+    finally:
+        if is_temp:
+            try:
+                pdf_path.unlink()
+            except FileNotFoundError:
+                # Already removed: nothing left to clean up.
+                pass