"""
Text extraction from native and scanned PDFs.

- Native PDF  -> pymupdf (direct text extraction)
- Scanned PDF -> docTR (OCR)
"""

import io
import logging
from dataclasses import dataclass
from pathlib import Path

import fitz  # pymupdf
import numpy as np
from PIL import Image

from config import (
    DOCTR_DET_ARCH,
    DOCTR_RECO_ARCH,
    NATIVE_TEXT_MIN_CHARS,
    OCR_DPI,
    OCR_MIN_CONFIDENCE,
)

logger = logging.getLogger(__name__)

# docTR is imported lazily (heavy import); the predictor is cached here after
# the first call to _get_doctr_predictor().
_doctr_predictor = None


def _get_doctr_predictor():
    """Return the shared docTR predictor, building it on first use.

    The model is created once with the architectures configured in
    ``DOCTR_DET_ARCH`` / ``DOCTR_RECO_ARCH`` and then reused by every
    subsequent call.
    """
    global _doctr_predictor
    if _doctr_predictor is not None:
        return _doctr_predictor

    logger.info("Chargement du modèle docTR...")
    # Imported here rather than at module level so that importing this module
    # does not pay the docTR import cost when no OCR is needed.
    from doctr.models import ocr_predictor

    _doctr_predictor = ocr_predictor(
        det_arch=DOCTR_DET_ARCH,
        reco_arch=DOCTR_RECO_ARCH,
        pretrained=True,
        assume_straight_pages=True,
    )
    logger.info("Modèle docTR chargé.")
    return _doctr_predictor


@dataclass
class PageResult:
    """Extraction result for a single page."""

    page_num: int  # 1-based page number
    text: str  # extracted text ("" for a page with no usable text)
    method: str  # "native" or "ocr"
    confidence: float  # 1.0 for native text, mean docTR word score for OCR


@dataclass
class PDFExtractionResult:
    """Complete extraction result for one PDF file."""

    file_path: str
    total_pages: int
    pages: list[PageResult]
    native_pages: int  # number of pages extracted directly
    ocr_pages: int  # number of pages sent through OCR (including empty ones)

    @property
    def full_text(self) -> str:
        """Text of the whole document.

        Pages whose text is empty or whitespace-only are skipped; the
        remaining page texts are joined with a blank line.
        """
        return "\n\n".join(p.text for p in self.pages if p.text.strip())


def _page_to_image(page: fitz.Page, dpi: int = OCR_DPI) -> np.ndarray:
    """Render a PDF page as an RGB numpy array for docTR.

    Args:
        page: The PyMuPDF page to rasterise.
        dpi: Target resolution; converted to a zoom factor relative to the
            72 dpi base used for PDF page units.

    Returns:
        The rendered page as a numpy array built from an RGB PIL image.
    """
    zoom = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    png_bytes = pixmap.tobytes("png")
    rgb_image = Image.open(io.BytesIO(png_bytes)).convert("RGB")
    return np.array(rgb_image)


def _extract_text_doctr(image_array: np.ndarray) -> tuple[str, float]:
    """Run docTR OCR on one image.

    Words whose confidence is below ``OCR_MIN_CONFIDENCE`` are dropped, and
    lines left with no word are omitted from the output.

    Args:
        image_array: Page image as produced by ``_page_to_image``.

    Returns:
        ``(text, mean_confidence)`` where ``text`` has one OCR line per text
        line and ``mean_confidence`` is the average score of the kept words
        (``0.0`` when no word was kept).
    """
    predictor = _get_doctr_predictor()
    result = predictor([image_array])

    lines: list[str] = []
    confidences: list[float] = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                kept = [
                    word
                    for word in line.words
                    if word.confidence >= OCR_MIN_CONFIDENCE
                ]
                if not kept:
                    continue
                lines.append(" ".join(word.value for word in kept))
                confidences.extend(word.confidence for word in kept)

    text = "\n".join(lines)
    avg_conf = sum(confidences) / len(confidences) if confidences else 0.0
    return text, avg_conf


def _extract_page(page: fitz.Page, page_num: int, total_pages: int) -> PageResult:
    """Extract one page, natively when possible and through OCR otherwise.

    Args:
        page: The PyMuPDF page to process.
        page_num: 1-based page number, stored in the result and used in logs.
        total_pages: Page count of the document, used in log messages only.

    Returns:
        A ``PageResult`` with method ``"native"`` when the embedded text has
        at least ``NATIVE_TEXT_MIN_CHARS`` characters, otherwise ``"ocr"``.
    """
    native_text = page.get_text().strip()
    if len(native_text) >= NATIVE_TEXT_MIN_CHARS:
        logger.debug(
            " Page %d/%d : natif (%d chars)",
            page_num, total_pages, len(native_text),
        )
        return PageResult(
            page_num=page_num,
            text=native_text,
            method="native",
            confidence=1.0,
        )

    # Too little embedded text: treat the page as scanned and OCR it.
    text, confidence = _extract_text_doctr(_page_to_image(page))
    if not text.strip():
        # OCR found nothing usable (cover page, separator, etc.): keep a
        # placeholder entry so the page is still represented in the result.
        logger.debug(" Page %d/%d : vide", page_num, total_pages)
        return PageResult(
            page_num=page_num,
            text="",
            method="ocr",
            confidence=0.0,
        )

    logger.debug(
        " Page %d/%d : OCR (conf=%.2f, %d chars)",
        page_num, total_pages, confidence, len(text),
    )
    return PageResult(
        page_num=page_num,
        text=text,
        method="ocr",
        confidence=confidence,
    )


def extract_pdf(pdf_path: str | Path) -> PDFExtractionResult:
    """Extract the text of a PDF, choosing native extraction or OCR per page.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A ``PDFExtractionResult`` holding one ``PageResult`` per page, in
        page order, plus the native/OCR page counts.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF non trouvé : {pdf_path}")

    # The context manager closes the document even if extraction or OCR
    # raises part-way through.
    with fitz.open(str(pdf_path)) as doc:
        total_pages = len(doc)
        logger.info("Extraction de %s (%d pages)", pdf_path.name, total_pages)
        pages = [
            _extract_page(page, page_num, total_pages)
            for page_num, page in enumerate(doc, start=1)
        ]

    native_count = sum(1 for p in pages if p.method == "native")
    # Every non-native page went through OCR, including the empty ones.
    ocr_count = len(pages) - native_count

    result = PDFExtractionResult(
        file_path=str(pdf_path),
        total_pages=total_pages,
        pages=pages,
        native_pages=native_count,
        ocr_pages=ocr_count,
    )
    logger.info(
        "Extraction terminée : %d pages natives, %d pages OCR, %d chars total",
        native_count, ocr_count, len(result.full_text),
    )
    return result