feat: support multi-formats — DOCX, images, ODT, RTF, TXT, HTML

Nouveau module format_converter.py : conversion automatique vers PDF
avant anonymisation. Formats supportés :
- PDF (passthrough)
- DOCX (python-docx → texte → PDF)
- ODT (odfpy → texte → PDF)
- RTF (striprtf → texte → PDF)
- TXT (texte brut → PDF via PyMuPDF)
- HTML (BeautifulSoup → texte → PDF)
- JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval)

Nouvelle fonction process_document() : wrapper qui gère la conversion
puis appelle process_pdf(). GUI mise à jour pour chercher tous les
formats supportés (plus seulement *.pdf).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-18 09:25:26 +01:00
parent 3992b43925
commit 437877e1c8
3 changed files with 342 additions and 9 deletions

256
format_converter.py Normal file
View File

@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""Convert multi-format documents to PDF for the anonymisation pipeline.

Supported formats:
- PDF: passthrough (nothing to do)
- DOCX: python-docx → text → PDF via PyMuPDF
- ODT: odfpy → text → PDF via PyMuPDF
- RTF: striprtf → text → PDF via PyMuPDF
- TXT: plain text → PDF via PyMuPDF
- HTML: BeautifulSoup → text → PDF via PyMuPDF
- JPEG/PNG/TIFF/BMP: image embedded in a PDF (docTR OCR runs downstream)

Usage:
    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS

    pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
    # ... process_pdf(pdf_path, ...) ...
    if is_temp:
        pdf_path.unlink()  # remove the temporary file
"""
from __future__ import annotations
import logging
import tempfile
from pathlib import Path
from typing import Tuple
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Image extensions: these inputs are embedded as pictures and need OCR.
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}

# Every accepted extension (lowercase, leading dot included): the text-based
# formats listed here plus all image extensions above.
SUPPORTED_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".odt",
    ".rtf",
    ".txt", ".text",
    ".html", ".htm",
} | _IMAGE_EXTENSIONS
def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
    """Convert a document to PDF for the anonymisation pipeline.

    PDF inputs are returned untouched. Any other supported format is
    converted into a sibling file named ``<stem>.tmp_convert.pdf`` which the
    caller is responsible for deleting once it is done with it.

    Args:
        input_path: Path of the source document. The converter is selected
            from its extension, compared case-insensitively.

    Returns:
        ``(pdf_path, is_temporary)``: the PDF to process and a flag telling
        whether it is a temporary file that must be cleaned up.

    Raises:
        ValueError: The extension is not in ``SUPPORTED_EXTENSIONS``.
        RuntimeError: The conversion failed; the original exception is
            chained as ``__cause__`` and any partial output is removed.
    """
    suffix = input_path.suffix.lower()
    if suffix == ".pdf":
        return input_path, False
    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format '{suffix}' non supporté. "
            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    # The temporary PDF lives next to the source to preserve its context.
    tmp_pdf = input_path.with_suffix(".tmp_convert.pdf")
    text_converters = {
        ".docx": _docx_to_pdf,
        ".odt": _odt_to_pdf,
        ".rtf": _rtf_to_pdf,
        ".txt": _txt_to_pdf,
        ".text": _txt_to_pdf,
        ".html": _html_to_pdf,
        ".htm": _html_to_pdf,
    }
    try:
        if suffix in _IMAGE_EXTENSIONS:
            converter = _image_to_pdf
        else:
            converter = text_converters.get(suffix)
        if converter is None:
            # Only reachable if SUPPORTED_EXTENSIONS gains an entry that has
            # no converter registered above.
            raise ValueError(f"Format '{suffix}' non implémenté")
        converter(input_path, tmp_pdf)
    except Exception as e:
        # Remove any partial output, but never let a cleanup failure hide
        # the conversion error that is about to be reported.
        try:
            if tmp_pdf.exists():
                tmp_pdf.unlink()
        except OSError:
            log.warning(
                "Impossible de supprimer le fichier temporaire %s", tmp_pdf
            )
        raise RuntimeError(f"Erreur conversion {input_path.name}: {e}") from e

    log.info("Converti %s → %s", input_path.name, tmp_pdf.name)
    return tmp_pdf, True
def _image_to_pdf(img_path: Path, out_pdf: Path):
    """Wrap an image file into a PDF, one PDF page per image frame.

    PyMuPDF opens the image as a document (a multi-page TIFF yields several
    pages) and converts it as a whole, so every frame is embedded at its
    native resolution instead of being re-rendered through a default
    resolution pixmap. No text layer is produced; OCR happens downstream.

    Args:
        img_path: Source image (JPEG, PNG, TIFF or BMP).
        out_pdf: Destination PDF path; overwritten if it exists.
    """
    import fitz

    # The context manager closes the document even if conversion fails.
    with fitz.open(str(img_path)) as img_doc:
        pdf_bytes = img_doc.convert_to_pdf()
    out_pdf.write_bytes(pdf_bytes)
def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0):
    """Render plain text into an A4 PDF with automatic wrapping and paging.

    Lines longer than the printable width are wrapped onto following rows
    rather than truncated, so no source text is dropped from the PDF.

    Args:
        text: Text to render; ``"\\n"`` separates lines.
        out_pdf: Destination PDF path.
        font_size: Font size in points. The row height is 1.4 times this
            value and the per-row character budget is derived from it.
    """
    import textwrap

    import fitz

    # A4 page size in PDF points.
    page_w, page_h = 595, 842
    margin = 50
    line_height = font_size * 1.4
    # Rough per-row capacity, assuming an average glyph width of about half
    # the font size. Clamped to 1 because textwrap rejects a width <= 0.
    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))

    doc = fitz.open()
    try:
        page = doc.new_page(width=page_w, height=page_h)
        y = margin
        for line in text.split("\n"):
            # textwrap returns [] for blank lines; keep them as an empty row
            # so paragraph spacing survives.
            for row in textwrap.wrap(line, width=max_chars) or [""]:
                if y + line_height > page_h - margin:
                    # Current page is full: continue on a fresh one.
                    page = doc.new_page(width=page_w, height=page_h)
                    y = margin
                if row:
                    origin = fitz.Point(margin, y + font_size)
                    try:
                        page.insert_text(
                            origin,
                            row,
                            fontsize=font_size,
                            fontname="helv",
                        )
                    except Exception:
                        # Fallback for characters the font cannot encode:
                        # degrade them to '?' instead of failing the file.
                        safe = row.encode("latin-1", errors="replace").decode("latin-1")
                        page.insert_text(
                            origin,
                            safe,
                            fontsize=font_size,
                            fontname="helv",
                        )
                y += line_height
        doc.save(str(out_pdf))
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()
def _docx_to_pdf(docx_path: Path, out_pdf: Path):
    """Extract the text of a DOCX file and render it as a PDF.

    Body paragraphs come first, followed by every table row flattened into
    a single ``"cell | cell | ..."`` line.

    Raises:
        RuntimeError: No non-blank text could be extracted.
    """
    from docx import Document

    document = Document(str(docx_path))
    chunks = [paragraph.text for paragraph in document.paragraphs]
    # Tables are appended after the paragraphs, one line per row.
    for table in document.tables:
        chunks.extend(
            " | ".join(cell.text.strip() for cell in row.cells)
            for row in table.rows
        )

    text = "\n".join(chunks)
    if not text.strip():
        raise RuntimeError("Document DOCX vide ou illisible")
    _text_to_pdf_pages(text, out_pdf)
def _odt_to_pdf(odt_path: Path, out_pdf: Path):
    """Extract the text of an ODT file and render it as a PDF.

    Only ``text:p`` paragraph elements are collected, one output line each.

    Raises:
        RuntimeError: No non-blank text could be extracted.
    """
    from odf.opendocument import load as odf_load
    from odf.text import P as OdfP
    from odf import teletype

    document = odf_load(str(odt_path))
    text = "\n".join(
        teletype.extractText(node)
        for node in document.getElementsByType(OdfP)
    )
    if not text.strip():
        raise RuntimeError("Document ODT vide ou illisible")
    _text_to_pdf_pages(text, out_pdf)
def _rtf_to_pdf(rtf_path: Path, out_pdf: Path):
    """Strip the RTF markup from a file and render the remaining text as a PDF.

    The file is read as UTF-8 with undecodable bytes replaced.

    Raises:
        RuntimeError: No non-blank text could be extracted.
    """
    from striprtf.striprtf import rtf_to_text

    text = rtf_to_text(rtf_path.read_text(encoding="utf-8", errors="replace"))
    if not text.strip():
        raise RuntimeError("Document RTF vide ou illisible")
    _text_to_pdf_pages(text, out_pdf)
def _txt_to_pdf(txt_path: Path, out_pdf: Path):
    """Convert a plain-text file to PDF, guessing its encoding.

    Encodings are tried from strictest to most permissive: UTF-8 (a leading
    BOM is dropped), then Windows-1252, then Latin-1. Latin-1 must come last
    because it accepts every byte sequence and would otherwise shadow
    Windows-1252, turning characters such as ``’``, ``€`` or ``œ`` into
    control codes.

    Raises:
        RuntimeError: The file contains no non-blank text.
    """
    raw = txt_path.read_bytes()
    for enc in ("utf-8-sig", "cp1252"):
        try:
            text = raw.decode(enc)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Latin-1 maps all 256 byte values, so this decode cannot fail.
        text = raw.decode("latin-1")

    # Decoding bytes does not translate newlines the way text-mode reading
    # does; normalise them so downstream line splitting sees only "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    if not text.strip():
        raise RuntimeError("Fichier texte vide")
    _text_to_pdf_pages(text, out_pdf)
def _html_to_pdf(html_path: Path, out_pdf: Path):
    """Extract the visible text of an HTML file and render it as a PDF.

    The file is passed to BeautifulSoup as raw bytes so that it can detect
    the encoding itself (BOM, ``<meta charset>``, common fallbacks) rather
    than being forced through UTF-8, which corrupts legacy ISO-8859-1 or
    Windows-1252 pages.

    Raises:
        RuntimeError: No non-blank text could be extracted.
    """
    import re

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")
    # Script and style contents are not document text: drop them.
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n")

    # Trim trailing whitespace first so whitespace-only lines count as blank,
    # then collapse runs of blank lines down to a single empty line.
    text = re.sub(r"[ \t\r\f\v]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    if not text.strip():
        raise RuntimeError("Document HTML vide ou illisible")
    _text_to_pdf_pages(text, out_pdf)