anonymisation/format_converter.py

#!/usr/bin/env python3
"""Conversion de documents multi-formats vers PDF pour le pipeline d'anonymisation.

Formats supportés :
  - PDF  : passthrough (rien à faire)
  - DOCX : python-docx → texte → PDF via PyMuPDF
  - ODT  : odfpy → texte → PDF via PyMuPDF
  - RTF  : striprtf → texte → PDF via PyMuPDF
  - TXT  : texte brut → PDF via PyMuPDF
  - HTML : BeautifulSoup → texte → PDF via PyMuPDF
  - JPEG/PNG/TIFF/BMP : image embarquée dans un PDF (OCR docTR en aval)

Usage :
    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS

    pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
    # ... process_pdf(pdf_path, ...) ...
    if is_temp:
        pdf_path.unlink()  # nettoyer le fichier temporaire
"""
from __future__ import annotations

import logging
import tempfile
from pathlib import Path
from typing import Tuple

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Image formats: these are embedded into a PDF as pictures, so their text
# has to be recovered by OCR further down the pipeline.
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}

# Every accepted extension (lowercase, leading dot included). Built from the
# document formats plus the image formats above so the two sets stay in sync.
SUPPORTED_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".odt",
    ".rtf",
    ".txt", ".text",
    ".html", ".htm",
} | _IMAGE_EXTENSIONS


def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
    """Convert a document to PDF for the anonymisation pipeline.

    PDF inputs are returned untouched. Any other supported format is converted
    into a sibling file named ``<stem>.tmp_convert.pdf`` which the caller is
    expected to delete once it is done with it.

    Args:
        input_path: Path of the source document. A plain string is accepted
            as well and is converted to a ``Path``.

    Returns:
        ``(pdf_path, is_temporary)``: the PDF to process, and a flag telling
        whether it is a temporary file that the caller has to clean up.

    Raises:
        ValueError: The file extension is not supported (or has no converter).
        RuntimeError: The conversion itself failed; the original exception is
            chained as ``__cause__``.
    """
    input_path = Path(input_path)
    suffix = input_path.suffix.lower()

    if suffix == ".pdf":
        return input_path, False

    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format '{suffix}' non supporté. "
            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    # Pick the converter before entering the try block, so that a missing
    # converter surfaces as the documented ValueError instead of being
    # re-wrapped as a RuntimeError.
    if suffix in _IMAGE_EXTENSIONS:
        converter = _image_to_pdf
    else:
        converter = {
            ".docx": _docx_to_pdf,
            ".odt": _odt_to_pdf,
            ".rtf": _rtf_to_pdf,
            ".txt": _txt_to_pdf,
            ".text": _txt_to_pdf,
            ".html": _html_to_pdf,
            ".htm": _html_to_pdf,
        }.get(suffix)
    if converter is None:
        raise ValueError(f"Format '{suffix}' non implémenté")

    # Temporary PDF next to the source file (keeps the document's context).
    # NOTE(review): two sources sharing a stem in the same folder (for example
    # scan.jpg and scan.png) map to the same temporary name.
    tmp_pdf = input_path.with_suffix(".tmp_convert.pdf")

    try:
        converter(input_path, tmp_pdf)
    except Exception as e:
        # Best-effort removal of a partially written file. A cleanup failure
        # must never hide the conversion error that is about to be raised.
        try:
            tmp_pdf.unlink()
        except FileNotFoundError:
            pass  # the converter failed before writing anything
        except OSError as cleanup_error:
            log.warning("Impossible de supprimer %s : %s", tmp_pdf, cleanup_error)
        raise RuntimeError(f"Erreur conversion {input_path.name}: {e}") from e

    log.info("Converti %s → %s", input_path.name, tmp_pdf.name)
    return tmp_pdf, True


def _image_to_pdf(img_path: Path, out_pdf: Path):
    """Embed an image into a PDF, one PDF page per image page.

    The image is opened as a PyMuPDF document so that multi-page files (such
    as multi-page TIFFs) produce one PDF page each. No text layer is created
    here; the module docstring states that OCR happens downstream.

    Args:
        img_path: Path of the source image.
        out_pdf: Path of the PDF file to write.
    """
    import fitz

    doc = fitz.open()
    try:
        # Opening the image as a document gives access to its page sizes.
        img_doc = fitz.open(str(img_path))
        try:
            for index, page in enumerate(img_doc):
                rect = page.rect
                pdf_page = doc.new_page(width=rect.width, height=rect.height)
                if index == 0:
                    # First page: embed the original file directly.
                    pdf_page.insert_image(rect, filename=str(img_path))
                else:
                    # Following pages (multi-page TIFF): embed a rendering of
                    # that page.
                    # NOTE(review): get_pixmap() is called with its default
                    # settings; confirm the resulting resolution is sufficient
                    # for OCR on pages after the first.
                    pdf_page.insert_image(rect, pixmap=page.get_pixmap())
        finally:
            img_doc.close()
        doc.save(str(out_pdf))
    finally:
        # Release the document even when opening, embedding or saving fails.
        doc.close()


def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0):
    """Render plain text into an A4 PDF, adding pages as needed.

    Each input line is written with the built-in "helv" font. Lines longer
    than the estimated capacity of a page line are wrapped onto several
    output lines instead of being cut off, so no text is lost.

    Args:
        text: Text to render; it is split on line boundaries.
        out_pdf: Path of the PDF file to write.
        font_size: Font size in points. Must be strictly positive.

    Raises:
        ValueError: ``font_size`` is not strictly positive.
    """
    import textwrap

    import fitz

    if font_size <= 0:
        raise ValueError("font_size doit être strictement positif")

    # A4 page size, in PDF points.
    page_w, page_h = 595, 842
    margin = 50
    line_height = font_size * 1.4
    # Estimated number of characters per line, assuming an average glyph
    # width of half the font size. This is an approximation, not a measure.
    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))

    doc = fitz.open()
    try:
        page = doc.new_page(width=page_w, height=page_h)
        y = margin

        # splitlines() also copes with "\r\n" and other line separators.
        for line in text.splitlines():
            if len(line) > max_chars:
                # Wrap instead of truncating; a whitespace-only line wraps to
                # nothing, so keep one empty row for it.
                segments = textwrap.wrap(line, width=max_chars) or [""]
            else:
                # Short lines are written exactly as they are.
                segments = [line]

            for segment in segments:
                if y + line_height > page_h - margin:
                    # No room left on this page: start a new one.
                    page = doc.new_page(width=page_w, height=page_h)
                    y = margin

                origin = fitz.Point(margin, y + font_size)
                try:
                    page.insert_text(
                        origin,
                        segment,
                        fontsize=font_size,
                        fontname="helv",
                    )
                except Exception:
                    # Fallback for characters the font cannot handle: replace
                    # anything outside latin-1 and try once more.
                    safe = segment.encode("latin-1", errors="replace").decode("latin-1")
                    page.insert_text(
                        origin,
                        safe,
                        fontsize=font_size,
                        fontname="helv",
                    )
                y += line_height

        doc.save(str(out_pdf))
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()


def _docx_to_pdf(docx_path: Path, out_pdf: Path):
    """Extract the text of a DOCX file and render it as a PDF.

    Body paragraphs come first, followed by every table row flattened to
    ``cell | cell | ...``. Tables are therefore appended after all paragraphs
    rather than placed where they appear in the document.

    Args:
        docx_path: Path of the source DOCX file.
        out_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: No text could be extracted from the document.
    """
    from docx import Document

    document = Document(str(docx_path))

    # Paragraph text, in the order python-docx returns it.
    chunks = [paragraph.text for paragraph in document.paragraphs]

    # Table content: one output line per row.
    chunks.extend(
        " | ".join(cell.text.strip() for cell in row.cells)
        for table in document.tables
        for row in table.rows
    )

    text = "\n".join(chunks)
    if text.strip() == "":
        raise RuntimeError("Document DOCX vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)


def _odt_to_pdf(odt_path: Path, out_pdf: Path):
    """Extract the text of an ODT file and render it as a PDF.

    Args:
        odt_path: Path of the source ODT file.
        out_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: No text could be extracted from the document.
    """
    from odf import teletype
    from odf.opendocument import load as odf_load
    from odf.text import P as OdfP

    document = odf_load(str(odt_path))

    # One output line per paragraph element.
    # NOTE(review): only elements of type odf.text.P are collected; confirm
    # whether headings (odf.text.H) should be included as well.
    text = "\n".join(
        teletype.extractText(node) for node in document.getElementsByType(OdfP)
    )

    if text.strip() == "":
        raise RuntimeError("Document ODT vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)


def _rtf_to_pdf(rtf_path: Path, out_pdf: Path):
    """Extract the text of an RTF file and render it as a PDF.

    The file is read as UTF-8 with undecodable bytes replaced, then stripped
    of its RTF markup.

    Args:
        rtf_path: Path of the source RTF file.
        out_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: No text is left once the markup has been removed.
    """
    from striprtf.striprtf import rtf_to_text

    text = rtf_to_text(rtf_path.read_text(encoding="utf-8", errors="replace"))

    if text.strip():
        _text_to_pdf_pages(text, out_pdf)
    else:
        raise RuntimeError("Document RTF vide ou illisible")


def _txt_to_pdf(txt_path: Path, out_pdf: Path):
    """Convert a plain-text file to PDF, guessing its encoding.

    Encodings are tried from the strictest to the most permissive:

    1. ``utf-8-sig``: UTF-8, with a leading byte-order mark removed if present.
    2. ``cp1252``: Windows Western European, which rejects a few byte values.
    3. ``latin-1``: accepts any byte sequence, so it has to come last.

    Args:
        txt_path: Path of the source text file.
        out_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: The file contains no text besides whitespace.
    """
    for enc in ("utf-8-sig", "cp1252"):
        try:
            text = txt_path.read_text(encoding=enc)
            break
        except UnicodeDecodeError:
            continue
    else:
        # latin-1 maps every byte to a character, so this cannot fail.
        text = txt_path.read_text(encoding="latin-1")

    if not text.strip():
        raise RuntimeError("Fichier texte vide")

    _text_to_pdf_pages(text, out_pdf)


def _html_to_pdf(html_path: Path, out_pdf: Path):
    """Extract the visible text of an HTML file and render it as a PDF.

    The raw bytes are handed to BeautifulSoup so that it can work out the
    character encoding itself, instead of forcing UTF-8 and replacing every
    byte that does not fit.

    Args:
        html_path: Path of the source HTML file.
        out_pdf: Path of the PDF file to write.

    Raises:
        RuntimeError: No text is left once the markup has been removed.
    """
    import re

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")

    # Drop scripts and style sheets: their content is not document text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    # The file is read as bytes, so normalise Windows / old Mac line endings
    # before collapsing blank lines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse runs of three or more newlines into a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)

    if not text.strip():
        raise RuntimeError("Document HTML vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)