feat: support multi-formats — DOCX, images, ODT, RTF, TXT, HTML
Nouveau module format_converter.py : conversion automatique vers PDF avant anonymisation.

Formats supportés :
- PDF (passthrough)
- DOCX (python-docx → texte → PDF)
- ODT (odfpy → texte → PDF)
- RTF (striprtf → texte → PDF)
- TXT (texte brut → PDF via PyMuPDF)
- HTML (BeautifulSoup → texte → PDF)
- JPEG/PNG/TIFF/BMP (image embarquée → OCR docTR en aval)

Nouvelle fonction process_document() : wrapper qui gère la conversion puis appelle process_pdf().

GUI mise à jour pour chercher tous les formats supportés (et non plus seulement *.pdf).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -658,10 +658,17 @@ class App:
|
||||
if not folder:
|
||||
return
|
||||
|
||||
# Compter les PDF (récursif)
|
||||
# Compter les documents supportés (récursif)
|
||||
try:
|
||||
from format_converter import SUPPORTED_EXTENSIONS
|
||||
except ImportError:
|
||||
SUPPORTED_EXTENSIONS = {".pdf"}
|
||||
pdf_count = 0
|
||||
try:
|
||||
pdf_count = len([p for p in Path(folder).rglob("*.pdf") if p.is_file()])
|
||||
pdf_count = len([
|
||||
p for p in Path(folder).rglob("*")
|
||||
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
|
||||
])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -689,7 +696,7 @@ class App:
|
||||
bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
|
||||
).pack(fill=tk.X)
|
||||
|
||||
suffix = "PDF trouvé (récursif)" if pdf_count <= 1 else "PDF trouvés (récursif)"
|
||||
suffix = "document trouvé (récursif)" if pdf_count <= 1 else "documents trouvés (récursif)"
|
||||
tk.Label(
|
||||
info_frame, text=f"{pdf_count} {suffix}",
|
||||
font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
|
||||
@@ -713,15 +720,25 @@ class App:
|
||||
if not folder.is_dir():
|
||||
messagebox.showwarning(
|
||||
"Dossier invalide",
|
||||
"Choisissez un dossier contenant des PDF.",
|
||||
"Choisissez un dossier contenant des documents.",
|
||||
)
|
||||
return
|
||||
|
||||
pdfs = sorted([p for p in folder.rglob("*.pdf") if p.is_file()])
|
||||
try:
|
||||
from format_converter import SUPPORTED_EXTENSIONS
|
||||
except ImportError:
|
||||
SUPPORTED_EXTENSIONS = {".pdf"}
|
||||
pdfs = sorted([
|
||||
p for p in folder.rglob("*")
|
||||
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
|
||||
])
|
||||
if not pdfs:
|
||||
exts = ", ".join(sorted(SUPPORTED_EXTENSIONS))
|
||||
messagebox.showwarning(
|
||||
"Aucun PDF",
|
||||
"Aucun fichier PDF trouvé\n(recherche récursive dans les sous-dossiers).",
|
||||
"Aucun document",
|
||||
f"Aucun fichier supporté trouvé.\n"
|
||||
f"Formats acceptés : {exts}\n"
|
||||
f"(recherche récursive dans les sous-dossiers)",
|
||||
)
|
||||
return
|
||||
|
||||
@@ -779,8 +796,12 @@ class App:
|
||||
and self._vlm_manager.is_loaded()
|
||||
)
|
||||
|
||||
outputs = core.process_pdf(
|
||||
pdf_path=pdf,
|
||||
# Utiliser process_document (multi-formats) si disponible,
|
||||
# sinon fallback sur process_pdf (PDF uniquement)
|
||||
_process_fn = getattr(core, 'process_document', None) or core.process_pdf
|
||||
_path_key = "doc_path" if _process_fn.__name__ == "process_document" else "pdf_path"
|
||||
outputs = _process_fn(
|
||||
**{_path_key: pdf},
|
||||
out_dir=outdir,
|
||||
make_vector_redaction=False,
|
||||
also_make_raster_burn=True,
|
||||
|
||||
Reference in New Issue
Block a user