feat: support multi-formats — DOCX, images, ODT, RTF, TXT, HTML
Nouveau module format_converter.py : conversion automatique vers PDF avant anonymisation. Formats supportés : - PDF (passthrough) - DOCX (python-docx → texte → PDF) - ODT (odfpy → texte → PDF) - RTF (striprtf → texte → PDF) - TXT (texte brut → PDF via PyMuPDF) - HTML (BeautifulSoup → texte → PDF) - JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval) Nouvelle fonction process_document() : wrapper qui gère la conversion puis appelle process_pdf(). GUI mise à jour pour chercher tous les formats supportés (plus seulement *.pdf). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -658,10 +658,17 @@ class App:
|
|||||||
if not folder:
|
if not folder:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Compter les PDF (récursif)
|
# Compter les documents supportés (récursif)
|
||||||
|
try:
|
||||||
|
from format_converter import SUPPORTED_EXTENSIONS
|
||||||
|
except ImportError:
|
||||||
|
SUPPORTED_EXTENSIONS = {".pdf"}
|
||||||
pdf_count = 0
|
pdf_count = 0
|
||||||
try:
|
try:
|
||||||
pdf_count = len([p for p in Path(folder).rglob("*.pdf") if p.is_file()])
|
pdf_count = len([
|
||||||
|
p for p in Path(folder).rglob("*")
|
||||||
|
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
|
||||||
|
])
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -689,7 +696,7 @@ class App:
|
|||||||
bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
|
bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
|
||||||
).pack(fill=tk.X)
|
).pack(fill=tk.X)
|
||||||
|
|
||||||
suffix = "PDF trouvé (récursif)" if pdf_count <= 1 else "PDF trouvés (récursif)"
|
suffix = "document trouvé (récursif)" if pdf_count <= 1 else "documents trouvés (récursif)"
|
||||||
tk.Label(
|
tk.Label(
|
||||||
info_frame, text=f"{pdf_count} {suffix}",
|
info_frame, text=f"{pdf_count} {suffix}",
|
||||||
font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
|
font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
|
||||||
@@ -713,15 +720,25 @@ class App:
|
|||||||
if not folder.is_dir():
|
if not folder.is_dir():
|
||||||
messagebox.showwarning(
|
messagebox.showwarning(
|
||||||
"Dossier invalide",
|
"Dossier invalide",
|
||||||
"Choisissez un dossier contenant des PDF.",
|
"Choisissez un dossier contenant des documents.",
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
pdfs = sorted([p for p in folder.rglob("*.pdf") if p.is_file()])
|
try:
|
||||||
|
from format_converter import SUPPORTED_EXTENSIONS
|
||||||
|
except ImportError:
|
||||||
|
SUPPORTED_EXTENSIONS = {".pdf"}
|
||||||
|
pdfs = sorted([
|
||||||
|
p for p in folder.rglob("*")
|
||||||
|
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
|
||||||
|
])
|
||||||
if not pdfs:
|
if not pdfs:
|
||||||
|
exts = ", ".join(sorted(SUPPORTED_EXTENSIONS))
|
||||||
messagebox.showwarning(
|
messagebox.showwarning(
|
||||||
"Aucun PDF",
|
"Aucun document",
|
||||||
"Aucun fichier PDF trouvé\n(recherche récursive dans les sous-dossiers).",
|
f"Aucun fichier supporté trouvé.\n"
|
||||||
|
f"Formats acceptés : {exts}\n"
|
||||||
|
f"(recherche récursive dans les sous-dossiers)",
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -779,8 +796,12 @@ class App:
|
|||||||
and self._vlm_manager.is_loaded()
|
and self._vlm_manager.is_loaded()
|
||||||
)
|
)
|
||||||
|
|
||||||
outputs = core.process_pdf(
|
# Utiliser process_document (multi-formats) si disponible,
|
||||||
pdf_path=pdf,
|
# sinon fallback sur process_pdf (PDF uniquement)
|
||||||
|
_process_fn = getattr(core, 'process_document', None) or core.process_pdf
|
||||||
|
_path_key = "doc_path" if _process_fn.__name__ == "process_document" else "pdf_path"
|
||||||
|
outputs = _process_fn(
|
||||||
|
**{_path_key: pdf},
|
||||||
out_dir=outdir,
|
out_dir=outdir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
|
|||||||
@@ -4119,3 +4119,59 @@ if __name__ == "__main__":
|
|||||||
ner_thresholds=NerThresholds() if NerThresholds else None,
|
ner_thresholds=NerThresholds() if NerThresholds else None,
|
||||||
)
|
)
|
||||||
print(json.dumps(outs, indent=2, ensure_ascii=False))
|
print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# process_document : wrapper multi-formats (PDF, DOCX, images, etc.)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def process_document(
    doc_path: Path,
    out_dir: Path,
    **kwargs,
) -> Dict[str, str]:
    """Anonymise a document of any supported format.

    The document is converted to PDF when needed and then handed to
    process_pdf(). Supported formats are listed in
    format_converter.SUPPORTED_EXTENSIONS.

    Args:
        doc_path: path of the source document (PDF, DOCX, image, etc.).
        out_dir: output directory, forwarded to process_pdf().
        **kwargs: extra keyword arguments forwarded untouched to process_pdf().

    Returns:
        The mapping returned by process_pdf(). When a temporary PDF was used,
        output files whose name carries the ".tmp_convert" marker are renamed
        after the original document and the mapping points to the new paths.

    Raises:
        ValueError: the file extension is not in SUPPORTED_EXTENSIONS.
    """
    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS

    # Accept plain strings as well as Path objects.
    doc_path = Path(doc_path)

    suffix = doc_path.suffix.lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format '{suffix}' non supporté. "
            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    pdf_path, is_temp = convert_to_pdf(doc_path)
    try:
        outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)
        if not is_temp:
            return outputs

        # Outputs were named after "<stem>.tmp_convert.pdf": give them the
        # original document's stem instead.
        original_stem = doc_path.stem
        temp_marker = original_stem + ".tmp_convert"
        renamed = {}
        for key, path_str in outputs.items():
            p = Path(path_str)
            if p.exists() and ".tmp_convert" in p.name:
                new_path = p.with_name(p.name.replace(temp_marker, original_stem))
                if new_path != p:
                    # replace() overwrites a stale output from a previous run;
                    # rename() raises FileExistsError on Windows in that case.
                    p.replace(new_path)
                renamed[key] = str(new_path)
            else:
                renamed[key] = path_str
        return renamed
    finally:
        # The intermediate PDF is only a conversion artefact: always remove it.
        if is_temp and pdf_path.exists():
            pdf_path.unlink()
|
||||||
|
|||||||
256
format_converter.py
Normal file
256
format_converter.py
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Conversion de documents multi-formats vers PDF pour le pipeline d'anonymisation.
|
||||||
|
|
||||||
|
Formats supportés :
|
||||||
|
- PDF : passthrough (rien à faire)
|
||||||
|
- DOCX : python-docx → texte → PDF via PyMuPDF
|
||||||
|
- ODT : odfpy → texte → PDF via PyMuPDF
|
||||||
|
- RTF : striprtf → texte → PDF via PyMuPDF
|
||||||
|
- TXT : texte brut → PDF via PyMuPDF
|
||||||
|
- HTML : BeautifulSoup → texte → PDF via PyMuPDF
|
||||||
|
- JPEG/PNG/TIFF/BMP : image embarquée dans un PDF (OCR docTR en aval)
|
||||||
|
|
||||||
|
Usage :
|
||||||
|
from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS
|
||||||
|
|
||||||
|
pdf_path, is_temp = convert_to_pdf(Path("document.docx"))
|
||||||
|
# ... process_pdf(pdf_path, ...) ...
|
||||||
|
if is_temp:
|
||||||
|
pdf_path.unlink() # nettoyer le fichier temporaire
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Image extensions (OCR required)
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}

# Every supported extension (lowercase, leading dot included)
SUPPORTED_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".odt",
    ".rtf",
    ".txt", ".text",
    ".html", ".htm",
} | _IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_pdf(input_path: Path) -> Tuple[Path, bool]:
    """Convert a document to PDF for the anonymisation pipeline.

    Args:
        input_path: path of the source document.

    Returns:
        (pdf_path, is_temporary): the PDF to process, and whether it is a
        temporary file that the caller has to delete afterwards. A PDF input
        is returned as-is with is_temporary set to False.

    Raises:
        ValueError: the extension is not in SUPPORTED_EXTENSIONS.
        RuntimeError: the conversion itself failed (original error chained).
    """
    ext = input_path.suffix.lower()

    # Nothing to convert: hand the caller its own path back.
    if ext == ".pdf":
        return input_path, False

    if ext not in SUPPORTED_EXTENSIONS:
        accepted = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise ValueError(f"Format '{ext}' non supporté. Formats acceptés : {accepted}")

    # The temporary PDF sits next to the source: "<stem>.tmp_convert.pdf".
    target = input_path.with_suffix(".tmp_convert.pdf")

    try:
        # (extensions, converter) pairs; the extension groups are disjoint.
        routes = (
            (_IMAGE_EXTENSIONS, _image_to_pdf),
            ({".docx"}, _docx_to_pdf),
            ({".odt"}, _odt_to_pdf),
            ({".rtf"}, _rtf_to_pdf),
            ({".txt", ".text"}, _txt_to_pdf),
            ({".html", ".htm"}, _html_to_pdf),
        )
        converter = next((fn for exts, fn in routes if ext in exts), None)
        if converter is None:
            raise ValueError(f"Format '{ext}' non implémenté")

        converter(input_path, target)
        log.info("Converti %s → %s", input_path.name, target.name)
        return target, True
    except Exception as err:
        # Do not leave a half-written PDF behind.
        if target.exists():
            target.unlink()
        raise RuntimeError(f"Erreur conversion {input_path.name}: {err}") from err
|
||||||
|
|
||||||
|
|
||||||
|
def _image_to_pdf(img_path: Path, out_pdf: Path):
    """Wrap an image file in a PDF, one PDF page per image page.

    PyMuPDF opens the image as a document (a multi-page TIFF yields several
    pages) and converts the whole document to PDF in one call. Every page is
    therefore handled the same way, with no re-rendering through
    get_pixmap() at its default resolution. No text layer is produced; OCR
    is expected to happen downstream.

    Args:
        img_path: source image.
        out_pdf: destination PDF path (overwritten if present).
    """
    import fitz

    img_doc = fitz.open(str(img_path))
    try:
        pdf_bytes = img_doc.convert_to_pdf()
    finally:
        # Release the image handle even if the conversion fails.
        img_doc.close()

    out_pdf.write_bytes(pdf_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_to_pdf_pages(text: str, out_pdf: Path, font_size: float = 10.0):
    """Write plain text to an A4 PDF with automatic wrapping and pagination.

    Lines that do not fit between the margins are wrapped onto following
    lines rather than cut off, so no text is lost (a whole DOCX/ODT
    paragraph arrives here as a single line). Blank lines are kept as
    vertical spacing.

    Args:
        text: text to render; lines are separated by "\\n".
        out_pdf: destination PDF path.
        font_size: font size in points; line spacing is 1.4 times this value.
    """
    import textwrap

    import fitz

    # A4 page size in PDF points
    page_w, page_h = 595, 842
    margin = 50
    line_height = font_size * 1.4
    # Line capacity estimate: average glyph width taken as half the font size.
    max_chars = max(1, int((page_w - 2 * margin) / (font_size * 0.5)))

    display_lines = []
    for line in text.split("\n"):
        # wrap() returns [] for an empty or whitespace-only line: keep it as a
        # blank line so paragraph spacing survives.
        wrapped = textwrap.wrap(line, width=max_chars, break_on_hyphens=False)
        display_lines.extend(wrapped or [""])

    doc = fitz.open()
    try:
        page = doc.new_page(width=page_w, height=page_h)
        y = margin

        for display_line in display_lines:
            if y + line_height > page_h - margin:
                # Current page is full: continue on a new one.
                page = doc.new_page(width=page_w, height=page_h)
                y = margin

            if display_line:
                try:
                    page.insert_text(
                        fitz.Point(margin, y + font_size),
                        display_line,
                        fontsize=font_size,
                        fontname="helv",
                    )
                except Exception:
                    # Fallback for characters the font cannot handle:
                    # anything outside Latin-1 becomes "?".
                    safe = display_line.encode("latin-1", errors="replace").decode("latin-1")
                    page.insert_text(
                        fitz.Point(margin, y + font_size),
                        safe,
                        fontsize=font_size,
                        fontname="helv",
                    )
            y += line_height

        doc.save(str(out_pdf))
    finally:
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _docx_to_pdf(docx_path: Path, out_pdf: Path):
    """Extract the text of a DOCX file and render it as a PDF.

    Body paragraphs come first, followed by the table rows (one line per row,
    cells joined with " | ").

    Raises:
        RuntimeError: no text could be extracted from the document.
    """
    from docx import Document

    source = Document(str(docx_path))

    body_lines = [para.text for para in source.paragraphs]
    # Tables are not part of `paragraphs`, so they are collected separately.
    table_lines = [
        " | ".join(cell.text.strip() for cell in row.cells)
        for table in source.tables
        for row in table.rows
    ]

    text = "\n".join(body_lines + table_lines)
    if not text.strip():
        raise RuntimeError("Document DOCX vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _odt_to_pdf(odt_path: Path, out_pdf: Path):
    """Extract the text of an ODT file and render it as a PDF.

    Raises:
        RuntimeError: no text could be extracted from the document.
    """
    from odf import teletype
    from odf.opendocument import load as odf_load
    from odf.text import P as OdfP

    document = odf_load(str(odt_path))
    # One output line per paragraph element of the document.
    text = "\n".join(
        teletype.extractText(node) for node in document.getElementsByType(OdfP)
    )
    if not text.strip():
        raise RuntimeError("Document ODT vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _rtf_to_pdf(rtf_path: Path, out_pdf: Path):
    """Extract the text of an RTF file and render it as a PDF.

    The file is read as UTF-8; undecodable bytes are replaced rather than
    raising.

    Raises:
        RuntimeError: no text could be extracted from the document.
    """
    from striprtf.striprtf import rtf_to_text

    with rtf_path.open(encoding="utf-8", errors="replace") as handle:
        markup = handle.read()

    text = rtf_to_text(markup)
    if not text.strip():
        raise RuntimeError("Document RTF vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _txt_to_pdf(txt_path: Path, out_pdf: Path):
    """Convert a plain-text file to PDF.

    The file is read once as bytes and decoded with the first encoding that
    succeeds: UTF-16 (only when the file starts with a UTF-16 byte-order
    mark), UTF-8 (a leading BOM is dropped), then cp1252. latin-1 is the
    last resort because it accepts any byte sequence, so it must come after
    cp1252 for cp1252 to ever be tried.

    Raises:
        RuntimeError: the file contains no text.
    """
    import codecs

    raw = txt_path.read_bytes()

    candidates = ["utf-8-sig", "cp1252"]
    if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        candidates.insert(0, "utf-16")

    for enc in candidates:
        try:
            text = raw.decode(enc)
            break
        except UnicodeDecodeError:
            continue
    else:
        # latin-1 maps every byte to a character and cannot fail.
        text = raw.decode("latin-1")

    # Decoding bytes does not translate line endings the way text-mode reads do.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    if not text.strip():
        raise RuntimeError("Fichier texte vide")

    _text_to_pdf_pages(text, out_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _html_to_pdf(html_path: Path, out_pdf: Path):
    """Extract the visible text of an HTML file and render it as a PDF.

    The raw bytes are passed to BeautifulSoup so that it detects the
    encoding itself (including a charset declared in the document) instead
    of the file being force-decoded as UTF-8. <script> and <style> content
    is discarded.

    Raises:
        RuntimeError: no text could be extracted from the document.
    """
    import re

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_path.read_bytes(), "html.parser")

    # Scripts and stylesheets are not document text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    # Collapse runs of three or more newlines into a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)

    if not text.strip():
        raise RuntimeError("Document HTML vide ou illisible")

    _text_to_pdf_pages(text, out_pdf)
|
||||||
Reference in New Issue
Block a user