From 437877e1c8d52fdd3409a2107aa48f71ce4fffc6 Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Wed, 18 Mar 2026 09:25:26 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20support=20multi-formats=20=E2=80=94=20D?= =?UTF-8?q?OCX,=20images,=20ODT,=20RTF,=20TXT,=20HTML?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nouveau module format_converter.py : conversion automatique vers PDF avant anonymisation. Formats supportés : - PDF (passthrough) - DOCX (python-docx → texte → PDF) - ODT (odfpy → texte → PDF) - RTF (striprtf → texte → PDF) - TXT (texte brut → PDF via PyMuPDF) - HTML (BeautifulSoup → texte → PDF) - JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval) Nouvelle fonction process_document() : wrapper qui gère la conversion puis appelle process_pdf(). GUI mise à jour pour chercher tous les formats supportés (plus seulement *.pdf). Co-Authored-By: Claude Opus 4.6 (1M context) --- Pseudonymisation_Gui_V5.py | 39 ++++- anonymizer_core_refactored_onnx.py | 56 +++++++ format_converter.py | 256 +++++++++++++++++++++++++++++ 3 files changed, 342 insertions(+), 9 deletions(-) create mode 100644 format_converter.py diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py index 637899b..d62fdff 100644 --- a/Pseudonymisation_Gui_V5.py +++ b/Pseudonymisation_Gui_V5.py @@ -658,10 +658,17 @@ class App: if not folder: return - # Compter les PDF (récursif) + # Compter les documents supportés (récursif) + try: + from format_converter import SUPPORTED_EXTENSIONS + except ImportError: + SUPPORTED_EXTENSIONS = {".pdf"} pdf_count = 0 try: - pdf_count = len([p for p in Path(folder).rglob("*.pdf") if p.is_file()]) + pdf_count = len([ + p for p in Path(folder).rglob("*") + if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS + ]) except Exception: pass @@ -689,7 +696,7 @@ class App: bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w", ).pack(fill=tk.X) - suffix = "PDF trouvé (récursif)" if pdf_count <= 1 else "PDF trouvés (récursif)" + suffix 
= "document trouvé (récursif)" if pdf_count <= 1 else "documents trouvés (récursif)" tk.Label( info_frame, text=f"{pdf_count} {suffix}", font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w", @@ -713,15 +720,25 @@ class App: if not folder.is_dir(): messagebox.showwarning( "Dossier invalide", - "Choisissez un dossier contenant des PDF.", + "Choisissez un dossier contenant des documents.", ) return - pdfs = sorted([p for p in folder.rglob("*.pdf") if p.is_file()]) + try: + from format_converter import SUPPORTED_EXTENSIONS + except ImportError: + SUPPORTED_EXTENSIONS = {".pdf"} + pdfs = sorted([ + p for p in folder.rglob("*") + if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS + ]) if not pdfs: + exts = ", ".join(sorted(SUPPORTED_EXTENSIONS)) messagebox.showwarning( - "Aucun PDF", - "Aucun fichier PDF trouvé\n(recherche récursive dans les sous-dossiers).", + "Aucun document", + f"Aucun fichier supporté trouvé.\n" + f"Formats acceptés : {exts}\n" + f"(recherche récursive dans les sous-dossiers)", ) return @@ -779,8 +796,12 @@ class App: and self._vlm_manager.is_loaded() ) - outputs = core.process_pdf( - pdf_path=pdf, + # Utiliser process_document (multi-formats) si disponible, + # sinon fallback sur process_pdf (PDF uniquement) + _process_fn = getattr(core, 'process_document', None) or core.process_pdf + _path_key = "doc_path" if _process_fn.__name__ == "process_document" else "pdf_path" + outputs = _process_fn( + **{_path_key: pdf}, out_dir=outdir, make_vector_redaction=False, also_make_raster_burn=True, diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index c9e99bc..6807db3 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -4119,3 +4119,59 @@ if __name__ == "__main__": ner_thresholds=NerThresholds() if NerThresholds else None, ) print(json.dumps(outs, indent=2, ensure_ascii=False)) + + +# --------------------------------------------------------------------------- 
+# process_document: multi-format wrapper (PDF, DOCX, images, etc.)
+# ---------------------------------------------------------------------------
+
+def process_document(
+    doc_path: Path,
+    out_dir: Path,
+    **kwargs,
+) -> Dict[str, str]:
+    """Anonymise a document in any supported format.
+
+    Converts the input to PDF when needed, then hands it to process_pdf().
+    Supported formats are listed in format_converter.SUPPORTED_EXTENSIONS.
+
+    Args:
+        doc_path: path of the source document (PDF, DOCX, image, etc.)
+        out_dir: output directory
+        **kwargs: forwarded unchanged to process_pdf()
+
+    Returns:
+        dict mapping output keys to output file paths
+    """
+    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
+
+    suffix = doc_path.suffix.lower()
+    if suffix not in SUPPORTED_EXTENSIONS:
+        raise ValueError(
+            f"Format '{suffix}' non supporté. "
+            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
+
+    pdf_path, is_temp = convert_to_pdf(doc_path)
+    try:
+        outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)
+
+        # Rename outputs after the original document, not the .tmp_convert.pdf
+        if is_temp:
+            tmp_stem = doc_path.stem + ".tmp_convert"
+            renamed = {}
+            for key, path_str in outputs.items():
+                p = Path(path_str)
+                if p.exists() and ".tmp_convert" in p.name:
+                    new_path = p.parent / p.name.replace(tmp_stem, doc_path.stem)
+                    # Path.replace overwrites; rename raises on Windows if target exists
+                    p.replace(new_path)
+                    renamed[key] = str(new_path)
+                else:
+                    renamed[key] = path_str
+            outputs = renamed
+
+        return outputs
+    finally:
+        if is_temp and pdf_path.exists():
+            pdf_path.unlink()
diff --git a/format_converter.py b/format_converter.py
new file mode 100644
index 0000000..3433ea7
--- /dev/null
+++ b/format_converter.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""Convert multi-format documents to PDF for the anonymisation pipeline.
+
+Supported formats:
+  - PDF  : passthrough (nothing to do)
+  - DOCX : python-docx → text → PDF via PyMuPDF
+  - ODT  : odfpy → text → PDF via PyMuPDF
+  - RTF  : striprtf → text → PDF via PyMuPDF
+  - TXT  : plain text → PDF via PyMuPDF
+  - HTML : BeautifulSoup → text → PDF via PyMuPDF
+  - JPEG/PNG/TIFF/BMP : image embedded in a PDF (OCR happens downstream)
+
+Usage:
+    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
+
+    pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
+    # ... process_pdf(pdf_path, ...) ...
+    if is_temp:
+        pdf_path.unlink()  # remove the temporary file
+"""
+from __future__ import annotations
+
+import logging
+import re
+import tempfile
+import textwrap
+from pathlib import Path
+from typing import Tuple
+
+log = logging.getLogger(__name__)
+
+# Supported extensions (lowercase, leading dot included)
+SUPPORTED_EXTENSIONS = {
+    ".pdf",
+    ".docx",
+    ".odt",
+    ".rtf",
+    ".txt", ".text",
+    ".html", ".htm",
+    ".jpg", ".jpeg",
+    ".png",
+    ".tiff", ".tif",
+    ".bmp",
+}
+
+# Image extensions (no text layer: OCR is required downstream)
+_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}
+
+
+def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
+    """Convert a document to PDF for the anonymisation pipeline.
+
+    Args:
+        input_path: path of the source document
+
+    Returns:
+        (pdf_path, is_temporary): the PDF path, and True when that PDF is a
+        temporary file the caller must delete (False for PDF passthrough).
+
+    Raises:
+        ValueError: unsupported format
+        RuntimeError: conversion failure (the original error is chained)
+    """
+    suffix = input_path.suffix.lower()
+
+    if suffix == ".pdf":
+        return input_path, False
+
+    if suffix not in SUPPORTED_EXTENSIONS:
+        raise ValueError(
+            f"Format '{suffix}' non supporté. "
+            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
+        )
+
+    # Temporary PDF written next to the source: "NAME.tmp_convert.pdf"
+    tmp_pdf = input_path.with_suffix(".tmp_convert.pdf")
+
+    try:
+        if suffix in _IMAGE_EXTENSIONS:
+            _image_to_pdf(input_path, tmp_pdf)
+        elif suffix == ".docx":
+            _docx_to_pdf(input_path, tmp_pdf)
+        elif suffix == ".odt":
+            _odt_to_pdf(input_path, tmp_pdf)
+        elif suffix == ".rtf":
+            _rtf_to_pdf(input_path, tmp_pdf)
+        elif suffix in {".txt", ".text"}:
+            _txt_to_pdf(input_path, tmp_pdf)
+        elif suffix in {".html", ".htm"}:
+            _html_to_pdf(input_path, tmp_pdf)
+        else:
+            raise ValueError(f"Format '{suffix}' non implémenté")
+
+        log.info("Converti %s → %s", input_path.name, tmp_pdf.name)
+        return tmp_pdf, True
+
+    except Exception as e:
+        # Do not leave a partial temporary PDF behind on failure
+        if tmp_pdf.exists():
+            tmp_pdf.unlink()
+        raise RuntimeError(f"Erreur conversion {input_path.name}: {e}") from e
+
+
+def _image_to_pdf(img_path: Path, out_pdf: Path):
+    """Embed an image in a PDF, one page per frame; OCR runs downstream."""
+    import fitz
+
+    img_doc = fitz.open(str(img_path))
+    try:
+        # Let PyMuPDF build one PDF page per image frame (multi-page TIFF
+        # included) instead of re-rendering frames through get_pixmap().
+        pdf_bytes = img_doc.convert_to_pdf()
+    finally:
+        img_doc.close()
+    out_pdf.write_bytes(pdf_bytes)
+
+
+def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0):
+    """Create a PDF from plain text, wrapping long lines and paginating.
+
+    Lines longer than the page width are word-wrapped (not truncated) and a
+    new A4 page is started whenever the current one is full.
+    """
+    import fitz
+
+    page_w, page_h = 595, 842  # A4, in points
+    margin = 50
+    line_height = font_size * 1.4
+    # Rough line capacity, assuming an average glyph width of 0.5 * font_size
+    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))
+
+    doc = fitz.open()
+    try:
+        page = doc.new_page(width=page_w, height=page_h)
+        y = margin
+        for line in text.split("\n"):
+            if len(line) > max_chars:
+                # Wrap rather than truncate, so no source text is dropped
+                chunks = textwrap.wrap(
+                    line, width=max_chars, break_on_hyphens=False
+                ) or [""]
+            else:
+                chunks = [line]
+
+            for chunk in chunks:
+                if y + line_height > page_h - margin:
+                    # Start a new page
+                    page = doc.new_page(width=page_w, height=page_h)
+                    y = margin
+                origin = fitz.Point(margin, y + font_size)
+                try:
+                    page.insert_text(
+                        origin, chunk, fontsize=font_size, fontname="helv"
+                    )
+                except Exception:
+                    # Fallback for characters the font cannot handle
+                    safe = chunk.encode("latin-1", errors="replace").decode("latin-1")
+                    page.insert_text(
+                        origin, safe, fontsize=font_size, fontname="helv"
+                    )
+                y += line_height
+
+        doc.save(str(out_pdf))
+    finally:
+        doc.close()
+
+
+def _docx_to_pdf(docx_path: Path, out_pdf: Path):
+    """Extract the text of a DOCX (paragraphs, then tables) into a PDF."""
+    from docx import Document
+
+    doc = Document(str(docx_path))
+    lines = [para.text for para in doc.paragraphs]
+
+    # Tables come after the body text, one "a | b | c" line per row
+    for table in doc.tables:
+        for row in table.rows:
+            lines.append(" | ".join(cell.text.strip() for cell in row.cells))
+
+    text = "\n".join(lines)
+    if not text.strip():
+        raise RuntimeError("Document DOCX vide ou illisible")
+
+    _text_to_pdf_pages(text, out_pdf)
+
+
+def _odt_to_pdf(odt_path: Path, out_pdf: Path):
+    """Extract the paragraph text of an ODT into a PDF."""
+    from odf.opendocument import load as odf_load
+    from odf.text import P as OdfP
+    from odf import teletype
+
+    doc = odf_load(str(odt_path))
+    # NOTE(review): only text:p elements are read; headings (text:h) look
+    # like they are skipped -- confirm against odfpy before relying on this.
+    text = "\n".join(
+        teletype.extractText(p) for p in doc.getElementsByType(OdfP)
+    )
+    if not text.strip():
+        raise RuntimeError("Document ODT vide ou illisible")
+
+    _text_to_pdf_pages(text, out_pdf)
+
+
+def _rtf_to_pdf(rtf_path: Path, out_pdf: Path):
+    """Extract the text of an RTF file into a PDF."""
+    from striprtf.striprtf import rtf_to_text
+
+    raw = rtf_path.read_text(encoding="utf-8", errors="replace")
+    text = rtf_to_text(raw)
+    if not text.strip():
+        raise RuntimeError("Document RTF vide ou illisible")
+
+    _text_to_pdf_pages(text, out_pdf)
+
+
+def _txt_to_pdf(txt_path: Path, out_pdf: Path):
+    """Convert a plain-text file to PDF."""
+    raw = txt_path.read_bytes()
+    # Try the strict decoders first. latin-1 accepts any byte sequence, so it
+    # is the final fallback: nothing listed after it could ever be reached.
+    for enc in ("utf-8-sig", "cp1252"):
+        try:
+            text = raw.decode(enc)
+            break
+        except UnicodeDecodeError:
+            continue
+    else:
+        text = raw.decode("latin-1")
+
+    if not text.strip():
+        raise RuntimeError("Fichier texte vide")
+
+    _text_to_pdf_pages(text, out_pdf)
+
+
+def _html_to_pdf(html_path: Path, out_pdf: Path):
+    """Extract the visible text of an HTML file into a PDF."""
+    from bs4 import BeautifulSoup
+
+    # Hand over raw bytes so BeautifulSoup detects the document encoding
+    # itself instead of assuming UTF-8.
+    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")
+
+    # Drop scripts and styles: their content is not document text
+    for tag in soup(["script", "style"]):
+        tag.decompose()
+
+    text = soup.get_text(separator="\n")
+    # Collapse runs of blank lines
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    if not text.strip():
+        raise RuntimeError("Document HTML vide ou illisible")
+
+    _text_to_pdf_pages(text, out_pdf)