Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
170 lines
5.0 KiB
Python
170 lines
5.0 KiB
Python
"""
|
|
Extraction de texte depuis des PDF natifs et scannés.
|
|
- PDF natif → pymupdf (extraction directe)
|
|
- PDF scanné → docTR (OCR)
|
|
"""
|
|
# Standard library
import io
import logging
from dataclasses import dataclass
from pathlib import Path

# Third-party
import fitz  # pymupdf
import numpy as np
from PIL import Image

# Local configuration
from config import NATIVE_TEXT_MIN_CHARS, OCR_DPI, DOCTR_DET_ARCH, DOCTR_RECO_ARCH, OCR_MIN_CONFIDENCE

logger = logging.getLogger(__name__)

# Lazy loading of docTR (heavy to import): the predictor is built on first use
# by _get_doctr_predictor() and cached in this module-level slot.
_doctr_predictor = None
|
|
|
|
|
|
def _get_doctr_predictor():
    """Return the shared docTR OCR predictor, building it on first call (lazy singleton)."""
    global _doctr_predictor
    if _doctr_predictor is not None:
        return _doctr_predictor

    logger.info("Chargement du modèle docTR...")
    # Imported here rather than at module level: docTR is expensive to import,
    # and callers that only process native PDFs never need it.
    from doctr.models import ocr_predictor

    _doctr_predictor = ocr_predictor(
        det_arch=DOCTR_DET_ARCH,
        reco_arch=DOCTR_RECO_ARCH,
        pretrained=True,
        assume_straight_pages=True,
    )
    logger.info("Modèle docTR chargé.")
    return _doctr_predictor
|
|
|
|
|
|
@dataclass
class PageResult:
    """Extraction result for a single PDF page."""

    page_num: int      # 1-based page number
    text: str          # extracted text (empty for blank pages)
    method: str        # how the text was obtained: "native" or "ocr"
    confidence: float  # 1.0 for native extraction, mean docTR word score for OCR
|
|
|
|
|
|
@dataclass
class PDFExtractionResult:
    """Aggregated extraction result for a whole PDF document."""

    file_path: str     # path of the source PDF
    total_pages: int   # number of pages in the document
    pages: list        # list[PageResult], in document order
    native_pages: int  # count of pages extracted from the native text layer
    ocr_pages: int     # count of pages that went through OCR

    @property
    def full_text(self) -> str:
        """Join the text of every non-blank page, separated by blank lines."""
        non_empty = [page.text for page in self.pages if page.text.strip()]
        return "\n\n".join(non_empty)
|
|
|
|
|
|
def _page_to_image(page: fitz.Page, dpi: int = OCR_DPI) -> np.ndarray:
    """Render a PDF page to an RGB numpy array suitable as docTR input."""
    # PDF user space is 72 dpi; scale the render matrix up to the target resolution.
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    pil_image = Image.open(io.BytesIO(pixmap.tobytes("png"))).convert("RGB")
    return np.array(pil_image)
|
|
|
|
|
|
def _extract_text_doctr(image_array: np.ndarray) -> tuple[str, float]:
    """
    Run docTR OCR on a single page image.

    Returns (text, mean_confidence). Words below OCR_MIN_CONFIDENCE are
    dropped and do not contribute to the mean; an image with no retained
    words yields ("", 0.0).
    """
    ocr_result = _get_doctr_predictor()([image_array])

    kept_lines: list[str] = []
    kept_scores: list[float] = []

    for ocr_page in ocr_result.pages:
        for block in ocr_page.blocks:
            for line in block.lines:
                accepted = [w for w in line.words if w.confidence >= OCR_MIN_CONFIDENCE]
                if not accepted:
                    continue
                kept_lines.append(" ".join(w.value for w in accepted))
                kept_scores.extend(w.confidence for w in accepted)

    text = "\n".join(kept_lines)
    mean_conf = sum(kept_scores) / len(kept_scores) if kept_scores else 0.0
    return text, mean_conf
|
|
|
|
|
|
def extract_pdf(pdf_path: str | Path) -> PDFExtractionResult:
    """
    Extract text from a PDF, automatically detecting native vs scanned pages.

    A page whose embedded text layer has at least NATIVE_TEXT_MIN_CHARS
    characters is treated as native; otherwise it is rendered and run through
    docTR OCR. Pages yielding no text (cover sheets, separators) are kept as
    empty OCR pages so page numbering stays contiguous.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        PDFExtractionResult with one PageResult per page.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF non trouvé : {pdf_path}")

    doc = fitz.open(str(pdf_path))
    try:
        total_pages = len(doc)
        pages = []
        native_count = 0
        ocr_count = 0

        logger.info("Extraction de %s (%d pages)", pdf_path.name, total_pages)

        for page_num in range(total_pages):
            page = doc[page_num]
            native_text = page.get_text().strip()

            if len(native_text) >= NATIVE_TEXT_MIN_CHARS:
                # Native page: trust the embedded text layer.
                pages.append(PageResult(
                    page_num=page_num + 1,
                    text=native_text,
                    method="native",
                    confidence=1.0,
                ))
                native_count += 1
                logger.debug("  Page %d/%d : natif (%d chars)",
                             page_num + 1, total_pages, len(native_text))
            else:
                # Scanned page → render and OCR with docTR.
                image = _page_to_image(page)
                text, confidence = _extract_text_doctr(image)

                if text.strip():
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text=text,
                        method="ocr",
                        confidence=confidence,
                    ))
                    logger.debug("  Page %d/%d : OCR (conf=%.2f, %d chars)",
                                 page_num + 1, total_pages, confidence, len(text))
                else:
                    # Blank page: keep an empty placeholder so page numbers
                    # stay contiguous in the result.
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text="",
                        method="ocr",
                        confidence=0.0,
                    ))
                    logger.debug("  Page %d/%d : vide", page_num + 1, total_pages)
                ocr_count += 1
    finally:
        # Always release the document handle, even if rendering/OCR raises.
        doc.close()

    result = PDFExtractionResult(
        file_path=str(pdf_path),
        total_pages=total_pages,
        pages=pages,
        native_pages=native_count,
        ocr_pages=ocr_count,
    )

    logger.info(
        "Extraction terminée : %d pages natives, %d pages OCR, %d chars total",
        native_count, ocr_count, len(result.full_text),
    )

    return result
|