#!/usr/bin/env python3
"""Multi-format document conversion to PDF for the anonymisation pipeline.

Supported formats:
- PDF  : passthrough (nothing to do)
- DOCX : python-docx → text → PDF via PyMuPDF
- ODT  : odfpy → text → PDF via PyMuPDF
- RTF  : striprtf → text → PDF via PyMuPDF
- TXT  : plain text → PDF via PyMuPDF
- HTML : BeautifulSoup → text → PDF via PyMuPDF
- JPEG/PNG/TIFF/BMP : image embedded in a PDF (docTR OCR runs downstream)

Usage:
    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS

    pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
    # ... process_pdf(pdf_path, ...) ...
    if is_temp:
        pdf_path.unlink()  # remove the temporary file
"""
from __future__ import annotations

import contextlib
import logging
import re
import tempfile
import textwrap
from pathlib import Path
from typing import Callable, Optional, Tuple

log = logging.getLogger(__name__)

# Supported extensions (lowercase, including the leading dot).
SUPPORTED_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".odt",
    ".rtf",
    ".txt",
    ".text",
    ".html",
    ".htm",
    ".jpg",
    ".jpeg",
    ".png",
    ".tiff",
    ".tif",
    ".bmp",
}

# Image extensions (OCR required downstream).
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}

# Three or more consecutive newlines, collapsed to one blank line in HTML text.
_BLANK_RUN_RE = re.compile(r"\n{3,}")


def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
    """Convert a document to PDF for the anonymisation pipeline.

    Args:
        input_path: path of the source document.

    Returns:
        ``(pdf_path, is_temporary)``: the PDF path, and a flag telling the
        caller whether that file is a temporary one it must delete.
        A ``.pdf`` input is returned unchanged with ``is_temporary=False``.

    Raises:
        ValueError: the extension is not supported.
        RuntimeError: the conversion itself failed (original error chained).
    """
    suffix = input_path.suffix.lower()

    if suffix == ".pdf":
        return input_path, False

    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format '{suffix}' non supporté. "
            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    converter = _converter_for(suffix)
    if converter is None:
        # Listed as supported but no converter wired up: a format problem,
        # so report it as ValueError rather than as a conversion failure.
        raise ValueError(f"Format '{suffix}' non implémenté")

    # Temporary PDF next to the source file (keeps the document's context).
    tmp_pdf = input_path.with_suffix(".tmp_convert.pdf")

    try:
        converter(input_path, tmp_pdf)
    except Exception as e:
        # Best-effort cleanup: a failing unlink must not hide the real error.
        with contextlib.suppress(OSError):
            tmp_pdf.unlink()
        raise RuntimeError(f"Erreur conversion {input_path.name}: {e}") from e

    log.info("Converti %s → %s", input_path.name, tmp_pdf.name)
    return tmp_pdf, True


def _converter_for(suffix: str) -> Optional[Callable[[Path, Path], None]]:
    """Return the converter function for a lowercase extension, or None."""
    if suffix in _IMAGE_EXTENSIONS:
        return _image_to_pdf
    # Built at call time because the converters are defined further down.
    text_converters = {
        ".docx": _docx_to_pdf,
        ".odt": _odt_to_pdf,
        ".rtf": _rtf_to_pdf,
        ".txt": _txt_to_pdf,
        ".text": _txt_to_pdf,
        ".html": _html_to_pdf,
        ".htm": _html_to_pdf,
    }
    return text_converters.get(suffix)


def _image_to_pdf(img_path: Path, out_pdf: Path) -> None:
    """Embed an image in a PDF, one PDF page per image page.

    Multi-page TIFFs yield one PDF page per frame. The conversion is delegated
    to PyMuPDF's ``Document.convert_to_pdf()`` so that every page is converted
    the same way; the previous code inserted pages after the first from a
    pixmap rendered at the default resolution, which hurts downstream OCR.

    Args:
        img_path: source image file.
        out_pdf: destination PDF path.
    """
    import fitz

    img_doc = fitz.open(str(img_path))
    try:
        pdf_bytes = img_doc.convert_to_pdf()
    finally:
        # Release the handle even when the conversion fails.
        img_doc.close()
    out_pdf.write_bytes(pdf_bytes)


def _wrap_line(line: str, max_chars: int) -> list[str]:
    """Split one logical line into display lines of at most ``max_chars``.

    Lines that already fit are returned untouched. Longer lines are wrapped on
    whitespace (hyphenated words are kept whole when they fit) instead of
    being truncated, so no text is lost. Always returns at least one element,
    so blank lines keep their vertical space.
    """
    if len(line) <= max_chars:
        return [line]
    return textwrap.wrap(line, width=max_chars, break_on_hyphens=False) or [""]


def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0) -> None:
    """Create a PDF from plain text, with automatic pagination.

    Args:
        text: text to render; split into lines with ``str.splitlines()``.
        out_pdf: destination PDF path.
        font_size: font size in points; line height is ``1.4 * font_size``.
    """
    import fitz

    # A4 in points.
    page_w, page_h = 595, 842
    margin = 50
    line_height = font_size * 1.4
    # Rough capacity of a line, assuming an average glyph width of half the
    # font size; at least 1 so wrapping always makes progress.
    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))

    doc = fitz.open()
    try:
        page = doc.new_page(width=page_w, height=page_h)
        y = margin

        for raw_line in text.splitlines():
            for display_line in _wrap_line(raw_line, max_chars):
                if y + line_height > page_h - margin:
                    # Page is full: continue on a new one.
                    page = doc.new_page(width=page_w, height=page_h)
                    y = margin

                origin = fitz.Point(margin, y + font_size)
                try:
                    page.insert_text(
                        origin,
                        display_line,
                        fontsize=font_size,
                        fontname="helv",
                    )
                except Exception:
                    # Fallback for unsupported characters: retry with a
                    # Latin-1-only version of the line.
                    safe = display_line.encode("latin-1", errors="replace").decode("latin-1")
                    page.insert_text(
                        origin,
                        safe,
                        fontsize=font_size,
                        fontname="helv",
                    )
                y += line_height

        doc.save(str(out_pdf))
    finally:
        doc.close()


def _docx_to_pdf(docx_path: Path, out_pdf: Path) -> None:
    """Extract the text of a DOCX file and render it as a PDF.

    Body paragraphs come first, then every table row as ``cell | cell | ...``.

    Raises:
        RuntimeError: no non-blank text could be extracted.
    """
    from docx import Document

    doc = Document(str(docx_path))

    lines = [para.text for para in doc.paragraphs]
    # Tables are appended after the paragraphs, one line per row.
    lines.extend(
        " | ".join(cell.text.strip() for cell in row.cells)
        for table in doc.tables
        for row in table.rows
    )

    text = "\n".join(lines)
    if not text.strip():
        raise RuntimeError("Document DOCX vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)


def _iter_odf_blocks(node, qnames):
    """Yield, in tree order, the descendants of ``node`` whose qname is in ``qnames``.

    Matched elements are not descended into: their nested text is already
    covered when the caller extracts the text of the matched element.
    """
    for child in getattr(node, "childNodes", ()):
        if getattr(child, "qname", None) in qnames:
            yield child
        else:
            yield from _iter_odf_blocks(child, qnames)


def _odt_to_pdf(odt_path: Path, out_pdf: Path) -> None:
    """Extract the text of an ODT file and render it as a PDF.

    Both paragraphs (``text:p``) and headings (``text:h``) are collected, in
    the order they appear in the document tree; collecting paragraphs only
    would silently drop every heading.

    Raises:
        RuntimeError: no non-blank text could be extracted.
    """
    from odf import teletype
    from odf.namespaces import TEXTNS
    from odf.opendocument import load as odf_load

    doc = odf_load(str(odt_path))
    block_qnames = {(TEXTNS, "p"), (TEXTNS, "h")}
    paragraphs = [
        teletype.extractText(element)
        for element in _iter_odf_blocks(doc.topnode, block_qnames)
    ]

    text = "\n".join(paragraphs)
    if not text.strip():
        raise RuntimeError("Document ODT vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)


def _rtf_to_pdf(rtf_path: Path, out_pdf: Path) -> None:
    """Extract the text of an RTF file and render it as a PDF.

    Raises:
        RuntimeError: no non-blank text could be extracted.
    """
    from striprtf.striprtf import rtf_to_text

    raw = rtf_path.read_text(encoding="utf-8", errors="replace")
    text = rtf_to_text(raw)

    if not text.strip():
        raise RuntimeError("Document RTF vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)


def _decode_text_bytes(data: bytes) -> str:
    """Decode raw text-file bytes, trying the most specific encoding first.

    Order matters: ``utf-8-sig`` (UTF-8, BOM stripped if present), then
    ``cp1252``, then ``latin-1``. ``latin-1`` accepts every byte sequence, so
    it has to come last or the encodings after it are never tried.
    """
    for encoding in ("utf-8-sig", "cp1252"):
        try:
            return data.decode(encoding)
        except UnicodeDecodeError:
            continue
    return data.decode("latin-1")


def _txt_to_pdf(txt_path: Path, out_pdf: Path) -> None:
    """Convert a plain-text file to PDF.

    Raises:
        RuntimeError: the file contains no non-blank text.
    """
    text = _decode_text_bytes(txt_path.read_bytes())

    if not text.strip():
        raise RuntimeError("Fichier texte vide")

    _text_to_pdf_pages(text, out_pdf)


def _html_to_pdf(html_path: Path, out_pdf: Path) -> None:
    """Extract the visible text of an HTML file and render it as a PDF.

    The file is handed to BeautifulSoup as raw bytes so that the parser can
    detect the document's encoding itself, instead of forcing UTF-8.

    Raises:
        RuntimeError: no non-blank text could be extracted.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")

    # Drop scripts and styles: their content is not document text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Collapse runs of blank lines down to a single blank line.
    text = _BLANK_RUN_RE.sub("\n\n", soup.get_text(separator="\n"))

    if not text.strip():
        raise RuntimeError("Document HTML vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)