feat: support multi-formats — DOCX, images, ODT, RTF, TXT, HTML

Nouveau module format_converter.py : conversion automatique vers PDF avant anonymisation.

Formats supportés :
- PDF (passthrough)
- DOCX (python-docx → texte → PDF)
- ODT (odfpy → texte → PDF)
- RTF (striprtf → texte → PDF)
- TXT (texte brut → PDF via PyMuPDF)
- HTML (BeautifulSoup → texte → PDF)
- JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval)

Nouvelle fonction process_document() : wrapper qui gère la conversion puis appelle process_pdf().

GUI mise à jour pour chercher tous les formats supportés (plus seulement *.pdf).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 09:25:26 +01:00
parent 3992b43925
commit 437877e1c8
3 changed files with 342 additions and 9 deletions
--- a/Pseudonymisation_Gui_V5.py
+++ b/Pseudonymisation_Gui_V5.py
@@ -658,10 +658,17 @@ class App:
        if not folder:
            return

-        # Compter les PDF (récursif)
+        # Compter les documents supportés (récursif)
+        try:
+            from format_converter import SUPPORTED_EXTENSIONS
+        except ImportError:
+            SUPPORTED_EXTENSIONS = {".pdf"}
        pdf_count = 0
        try:
-            pdf_count = len([p for p in Path(folder).rglob("*.pdf") if p.is_file()])
+            pdf_count = len([
+                p for p in Path(folder).rglob("*")
+                if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
+            ])
        except Exception:
            pass

@@ -689,7 +696,7 @@ class App:
            bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X)

-        suffix = "PDF trouvé (récursif)" if pdf_count <= 1 else "PDF trouvés (récursif)"
+        suffix = "document trouvé (récursif)" if pdf_count <= 1 else "documents trouvés (récursif)"
        tk.Label(
            info_frame, text=f"{pdf_count} {suffix}",
            font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
@@ -713,15 +720,25 @@ class App:
        if not folder.is_dir():
            messagebox.showwarning(
                "Dossier invalide",
-                "Choisissez un dossier contenant des PDF.",
+                "Choisissez un dossier contenant des documents.",
            )
            return

-        pdfs = sorted([p for p in folder.rglob("*.pdf") if p.is_file()])
+        try:
+            from format_converter import SUPPORTED_EXTENSIONS
+        except ImportError:
+            SUPPORTED_EXTENSIONS = {".pdf"}
+        pdfs = sorted([
+            p for p in folder.rglob("*")
+            if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
+        ])
        if not pdfs:
+            exts = ", ".join(sorted(SUPPORTED_EXTENSIONS))
            messagebox.showwarning(
-                "Aucun PDF",
-                "Aucun fichier PDF trouvé\n(recherche récursive dans les sous-dossiers).",
+                "Aucun document",
+                f"Aucun fichier supporté trouvé.\n"
+                f"Formats acceptés : {exts}\n"
+                f"(recherche récursive dans les sous-dossiers)",
            )
            return

@@ -779,8 +796,12 @@ class App:
                        and self._vlm_manager.is_loaded()
                    )

-                    outputs = core.process_pdf(
-                        pdf_path=pdf,
+                    # Utiliser process_document (multi-formats) si disponible,
+                    # sinon fallback sur process_pdf (PDF uniquement)
+                    _process_fn = getattr(core, 'process_document', None) or core.process_pdf
+                    _path_key = "doc_path" if _process_fn.__name__ == "process_document" else "pdf_path"
+                    outputs = _process_fn(
+                        **{_path_key: pdf},
                        out_dir=outdir,
                        make_vector_redaction=False,
                        also_make_raster_burn=True,
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -4119,3 +4119,59 @@ if __name__ == "__main__":
        ner_thresholds=NerThresholds() if NerThresholds else None,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))
+
+
+# ---------------------------------------------------------------------------
+# process_document : wrapper multi-formats (PDF, DOCX, images, etc.)
+# ---------------------------------------------------------------------------
+
+def process_document(
+    doc_path: Path,
+    out_dir: Path,
+    **kwargs,
+) -> Dict[str, str]:
+    """Anonymise a document of any supported format.
+
+    The document is converted to PDF when needed and then handed to
+    process_pdf(). When a temporary PDF had to be created, output files whose
+    name contains ".tmp_convert" are renamed after the original document, and
+    the temporary PDF is deleted afterwards, even if processing fails.
+
+    Args:
+        doc_path: path of the source document (PDF, DOCX, image, ...).
+        out_dir: output directory, forwarded to process_pdf().
+        **kwargs: extra keyword arguments forwarded to process_pdf().
+
+    Returns:
+        The mapping returned by process_pdf(), with the renamed paths
+        substituted where a rename took place.
+
+    Raises:
+        ValueError: the file extension is not in SUPPORTED_EXTENSIONS.
+    """
+    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
+
+    suffix = doc_path.suffix.lower()
+    if suffix not in SUPPORTED_EXTENSIONS:
+        raise ValueError(
+            f"Format '{suffix}' non supporté. "
+            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
+
+    pdf_path, is_temp = convert_to_pdf(doc_path)
+    try:
+        outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)
+        if not is_temp:
+            return outputs
+
+        # Give the outputs the original document's name instead of the
+        # "<stem>.tmp_convert" name of the intermediate PDF.
+        original_stem = doc_path.stem
+        temp_marker = original_stem + ".tmp_convert"
+        renamed = {}
+        for key, path_str in outputs.items():
+            p = Path(path_str)
+            if p.exists() and ".tmp_convert" in p.name:
+                new_path = p.parent / p.name.replace(temp_marker, original_stem)
+                # Path.replace() overwrites an existing target on every
+                # platform, whereas Path.rename() raises FileExistsError on
+                # Windows when the output of a previous run is still there.
+                p.replace(new_path)
+                renamed[key] = str(new_path)
+            else:
+                renamed[key] = path_str
+        return renamed
+    finally:
+        # Always remove the intermediate PDF, including on failure.
+        if is_temp and pdf_path.exists():
+            pdf_path.unlink()
--- a/format_converter.py
+++ b/format_converter.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""Conversion de documents multi-formats vers PDF pour le pipeline d'anonymisation.
+
+Formats supportés :
+  - PDF  : passthrough (rien à faire)
+  - DOCX : python-docx → texte → PDF via PyMuPDF
+  - ODT  : odfpy → texte → PDF via PyMuPDF
+  - RTF  : striprtf → texte → PDF via PyMuPDF
+  - TXT  : texte brut → PDF via PyMuPDF
+  - HTML : BeautifulSoup → texte → PDF via PyMuPDF
+  - JPEG/PNG/TIFF/BMP : image embarquée dans un PDF (OCR docTR en aval)
+
+Usage :
+    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
+
+    pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
+    # ... process_pdf(pdf_path, ...) ...
+    if is_temp:
+        pdf_path.unlink()  # nettoyer le fichier temporaire
+"""
+from __future__ import annotations
+
+import logging
+import tempfile
+from pathlib import Path
+from typing import Tuple
+
+log = logging.getLogger(__name__)
+
+# Supported extensions (lowercase, including the leading dot)
+SUPPORTED_EXTENSIONS = {
+    ".pdf",
+    ".docx",
+    ".odt",
+    ".rtf",
+    ".txt", ".text",
+    ".html", ".htm",
+    ".jpg", ".jpeg",
+    ".png",
+    ".tiff", ".tif",
+    ".bmp",
+}
+
+# Image extensions (these need OCR)
+_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}
+
+
+def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
+    """Convert a document to PDF for the anonymisation pipeline.
+
+    A ".pdf" input is returned untouched. Any other supported format is
+    converted into "<stem>.tmp_convert.pdf" next to the source file.
+
+    Args:
+        input_path: path of the source document.
+
+    Returns:
+        (pdf_path, is_temporary): the PDF to process, and True when that PDF
+        is a temporary file the caller has to delete.
+
+    Raises:
+        ValueError: the extension is not in SUPPORTED_EXTENSIONS.
+        RuntimeError: anything went wrong during the conversion (the partial
+            temporary PDF is removed first).
+    """
+    ext = input_path.suffix.lower()
+
+    # PDFs go straight through.
+    if ext == ".pdf":
+        return input_path, False
+
+    if ext not in SUPPORTED_EXTENSIONS:
+        accepted = ", ".join(sorted(SUPPORTED_EXTENSIONS))
+        raise ValueError(
+            f"Format '{ext}' non supporté. "
+            f"Formats acceptés : {accepted}"
+        )
+
+    # The temporary PDF lives in the same folder as the source document.
+    target = input_path.with_suffix(".tmp_convert.pdf")
+
+    try:
+        if ext in _IMAGE_EXTENSIONS:
+            converter = _image_to_pdf
+        else:
+            converter = {
+                ".docx": _docx_to_pdf,
+                ".odt": _odt_to_pdf,
+                ".rtf": _rtf_to_pdf,
+                ".txt": _txt_to_pdf,
+                ".text": _txt_to_pdf,
+                ".html": _html_to_pdf,
+                ".htm": _html_to_pdf,
+            }.get(ext)
+
+        if converter is None:
+            # Listed as supported but no converter is wired up for it.
+            raise ValueError(f"Format '{ext}' non implémenté")
+
+        converter(input_path, target)
+        log.info("Converti %s → %s", input_path.name, target.name)
+        return target, True
+
+    except Exception as exc:
+        # Do not leave a half-written temporary PDF behind.
+        if target.exists():
+            target.unlink()
+        raise RuntimeError(f"Erreur conversion {input_path.name}: {exc}") from exc
+
+
+def _image_to_pdf(img_path: Path, out_pdf: Path):
+    """Wrap an image file into a PDF, one PDF page per image frame.
+
+    No OCR happens here; per the module docstring, docTR is expected to
+    process the resulting PDF downstream.
+
+    Args:
+        img_path: source image (a multi-page TIFF yields several pages).
+        out_pdf: path of the PDF to write.
+    """
+    import fitz
+
+    img_doc = fitz.open(str(img_path))
+    try:
+        # Let MuPDF build the PDF for all frames itself, instead of
+        # re-rendering frames after the first one through get_pixmap(),
+        # whose default resolution is 72 dpi and degrades the OCR input.
+        pdf_bytes = img_doc.convert_to_pdf()
+    finally:
+        # Close the source even when the conversion raises.
+        img_doc.close()
+
+    pdf_doc = fitz.open("pdf", pdf_bytes)
+    try:
+        pdf_doc.save(str(out_pdf))
+    finally:
+        pdf_doc.close()
+
+
+def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0):
+    """Render plain text into an A4 PDF, wrapping and paginating as needed.
+
+    Each input line is wrapped to the estimated page width, so long
+    paragraphs keep all of their text instead of being cut off at the right
+    margin. Blank lines are kept as vertical spacing.
+
+    Args:
+        text: text to render; lines are separated by "\\n".
+        out_pdf: path of the PDF to write.
+        font_size: font size in points.
+    """
+    import textwrap
+
+    import fitz
+
+    # A4, in points
+    page_w, page_h = 595, 842
+    margin = 50
+    line_height = font_size * 1.4
+    # Rough capacity of one line, assuming an average glyph width of half
+    # the font size; never below 1 so textwrap accepts the width.
+    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))
+
+    doc = fitz.open()
+    try:
+        page = doc.new_page(width=page_w, height=page_h)
+        y = margin
+
+        for line in text.split("\n"):
+            # wrap() returns [] for an empty or whitespace-only line; keep
+            # such a line as one empty row.
+            for segment in textwrap.wrap(line, width=max_chars) or [""]:
+                if y + line_height > page_h - margin:
+                    # Start a new page
+                    page = doc.new_page(width=page_w, height=page_h)
+                    y = margin
+
+                origin = fitz.Point(margin, y + font_size)
+                try:
+                    page.insert_text(
+                        origin,
+                        segment,
+                        fontsize=font_size,
+                        fontname="helv",
+                    )
+                except Exception:
+                    # Fallback for unsupported characters: substitute
+                    # anything outside Latin-1 and retry.
+                    safe = segment.encode("latin-1", errors="replace").decode("latin-1")
+                    page.insert_text(
+                        origin,
+                        safe,
+                        fontsize=font_size,
+                        fontname="helv",
+                    )
+                y += line_height
+
+        doc.save(str(out_pdf))
+    finally:
+        # Release the document even if rendering or saving fails.
+        doc.close()
+
+
+def _docx_to_pdf(docx_path: Path, out_pdf: Path):
+    """Extract the text of a DOCX file and render it as a PDF.
+
+    Body paragraphs come first, followed by one line per table row with the
+    stripped cell texts joined by " | ".
+
+    Raises:
+        RuntimeError: no non-whitespace text could be extracted.
+    """
+    from docx import Document
+
+    source = Document(str(docx_path))
+
+    chunks = [para.text for para in source.paragraphs]
+    # Tables are appended after the paragraphs, one line per row.
+    chunks.extend(
+        " | ".join([cell.text.strip() for cell in row.cells])
+        for table in source.tables
+        for row in table.rows
+    )
+
+    content = "\n".join(chunks)
+    if not content.strip():
+        raise RuntimeError("Document DOCX vide ou illisible")
+
+    _text_to_pdf_pages(content, out_pdf)
+
+
+def _odt_to_pdf(odt_path: Path, out_pdf: Path):
+    """Extract the text of an ODT file and render it as a PDF.
+
+    The text of every paragraph element found in the document is collected,
+    one line per paragraph.
+
+    Raises:
+        RuntimeError: no non-whitespace text could be extracted.
+    """
+    from odf.opendocument import load as odf_load
+    from odf.text import P as OdfP
+    from odf import teletype
+
+    document = odf_load(str(odt_path))
+    content = "\n".join(
+        [teletype.extractText(node) for node in document.getElementsByType(OdfP)]
+    )
+
+    if not content.strip():
+        raise RuntimeError("Document ODT vide ou illisible")
+
+    _text_to_pdf_pages(content, out_pdf)
+
+
+def _rtf_to_pdf(rtf_path: Path, out_pdf: Path):
+    """Extract the text of an RTF file and render it as a PDF.
+
+    The file is read as UTF-8 (undecodable bytes are replaced) and its RTF
+    markup is stripped with striprtf.
+
+    Raises:
+        RuntimeError: no non-whitespace text could be extracted.
+    """
+    from striprtf.striprtf import rtf_to_text
+
+    markup = rtf_path.read_text(encoding="utf-8", errors="replace")
+    plain = rtf_to_text(markup)
+
+    if not plain.strip():
+        raise RuntimeError("Document RTF vide ou illisible")
+
+    _text_to_pdf_pages(plain, out_pdf)
+
+
+def _txt_to_pdf(txt_path: Path, out_pdf: Path):
+    """Convert a plain-text file to PDF.
+
+    The file is decoded as UTF-8 (a leading BOM is dropped), then as
+    cp1252, and finally as latin-1.
+
+    Raises:
+        RuntimeError: the file contains no non-whitespace text.
+    """
+    # Order matters: latin-1 can decode any byte sequence, so it has to be
+    # the last resort; placed earlier it would make cp1252 unreachable.
+    for enc in ("utf-8-sig", "cp1252"):
+        try:
+            text = txt_path.read_text(encoding=enc)
+            break
+        except UnicodeDecodeError:
+            continue
+    else:
+        # cp1252 leaves a few byte values undefined; latin-1 never fails.
+        text = txt_path.read_text(encoding="latin-1")
+
+    if not text.strip():
+        raise RuntimeError("Fichier texte vide")
+
+    _text_to_pdf_pages(text, out_pdf)
+
+
+def _html_to_pdf(html_path: Path, out_pdf: Path):
+    """Extract the visible text of an HTML file and render it as a PDF.
+
+    <script> and <style> elements are removed before the text is extracted,
+    and runs of three or more newlines are collapsed to two.
+
+    Raises:
+        RuntimeError: no non-whitespace text could be extracted.
+    """
+    import re
+
+    from bs4 import BeautifulSoup
+
+    # Hand the raw bytes to BeautifulSoup so it can detect the encoding
+    # (declared charset or detection) itself. Forcing UTF-8 with
+    # errors="replace" turned accented characters of non-UTF-8 pages into
+    # U+FFFD.
+    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")
+
+    # Drop scripts and styles
+    for tag in soup(["script", "style"]):
+        tag.decompose()
+
+    text = soup.get_text(separator="\n")
+    # Collapse runs of blank lines
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    if not text.strip():
+        raise RuntimeError("Document HTML vide ou illisible")
+
+    _text_to_pdf_pages(text, out_pdf)