Files
t2a-extractor/extractor/pdf_reader.py
dom f70d138db3 feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text
segmentation, LLM extraction (Ollama), deterministic post-processing
normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 20:44:32 +01:00

170 lines
5.0 KiB
Python

"""
Extraction de texte depuis des PDF natifs et scannés.
- PDF natif → pymupdf (extraction directe)
- PDF scanné → docTR (OCR)
"""
import fitz # pymupdf
import numpy as np
from PIL import Image
from pathlib import Path
from dataclasses import dataclass
import io
import logging
from config import NATIVE_TEXT_MIN_CHARS, OCR_DPI, DOCTR_DET_ARCH, DOCTR_RECO_ARCH, OCR_MIN_CONFIDENCE
logger = logging.getLogger(__name__)
# Lazy loading of docTR (heavy to import): the predictor is created on first
# use by _get_doctr_predictor() and cached in this module-level variable.
_doctr_predictor = None
def _get_doctr_predictor():
    """
    Return the shared docTR OCR predictor, building it on the first call.

    The predictor is stored in the module-level ``_doctr_predictor`` so that
    later calls reuse the already-loaded model.
    """
    global _doctr_predictor
    if _doctr_predictor is not None:
        return _doctr_predictor

    logger.info("Chargement du modèle docTR...")
    # Imported here rather than at module level because docTR is heavy to
    # import; the cost is only paid once OCR is actually needed.
    from doctr.models import ocr_predictor

    predictor_options = {
        "det_arch": DOCTR_DET_ARCH,
        "reco_arch": DOCTR_RECO_ARCH,
        "pretrained": True,
        "assume_straight_pages": True,
    }
    _doctr_predictor = ocr_predictor(**predictor_options)
    logger.info("Modèle docTR chargé.")
    return _doctr_predictor
@dataclass
class PageResult:
    """
    Text extracted from a single page of a PDF.

    Attributes:
        page_num: Number of the page within the document.
        text: Text obtained for the page (may be empty).
        method: Extraction route used, either "native" or "ocr".
        confidence: 1.0 for native pages, mean docTR score for OCR pages.
    """

    page_num: int
    text: str
    method: str
    confidence: float
@dataclass
class PDFExtractionResult:
    """
    Outcome of extracting a whole PDF.

    Attributes:
        file_path: Path of the source PDF, as a string.
        total_pages: Number of pages in the document.
        pages: Per-page results (a list of PageResult objects).
        native_pages: Count of pages read from the native text layer.
        ocr_pages: Count of pages that went through OCR.
    """

    file_path: str
    total_pages: int
    pages: list  # list[PageResult]
    native_pages: int
    ocr_pages: int

    @property
    def full_text(self) -> str:
        """Text of the whole document: non-blank pages joined by a blank line."""
        non_blank = [page.text for page in self.pages if page.text.strip()]
        return "\n\n".join(non_blank)
def _page_to_image(page: fitz.Page, dpi: int = OCR_DPI) -> np.ndarray:
    """
    Render a PDF page to an RGB numpy image for docTR.

    Args:
        page: The PyMuPDF page to rasterize.
        dpi: Rendering resolution. PDF user space is 72 units per inch, so
            the page is scaled by ``dpi / 72``.

    Returns:
        A writable ``uint8`` array of shape (height, width, 3).
    """
    zoom = dpi / 72
    # RGB without alpha is requested explicitly so the sample buffer is
    # guaranteed to hold exactly three bytes per pixel.
    pix = page.get_pixmap(
        matrix=fitz.Matrix(zoom, zoom),
        colorspace=fitz.csRGB,
        alpha=False,
    )
    # Read the raw samples directly instead of encoding the pixmap to PNG and
    # decoding it again: the pixels are identical, without the
    # compress/decompress round trip on every page.
    pixels = np.frombuffer(pix.samples, dtype=np.uint8)
    # frombuffer yields a read-only view over the bytes object; copy so the
    # caller gets an independent, writable array.
    return pixels.reshape(pix.height, pix.width, pix.n).copy()
def _extract_text_doctr(image_array: np.ndarray) -> tuple[str, float]:
    """
    Run docTR on one image and collect the recognised text.

    Words whose confidence is below OCR_MIN_CONFIDENCE are dropped; a line
    left with no words is omitted entirely.

    Returns:
        A ``(text, mean_confidence)`` pair. ``text`` has one OCR line per
        row; ``mean_confidence`` is the average over the retained words, or
        0.0 when no word was retained.
    """
    ocr_output = _get_doctr_predictor()([image_array])

    # Flatten the page -> block -> line hierarchy into one stream of lines.
    all_lines = (
        line
        for page in ocr_output.pages
        for block in page.blocks
        for line in block.lines
    )

    kept_lines = []
    kept_scores = []
    for ocr_line in all_lines:
        confident = [w for w in ocr_line.words if w.confidence >= OCR_MIN_CONFIDENCE]
        if not confident:
            continue
        kept_lines.append(" ".join(w.value for w in confident))
        kept_scores.extend(w.confidence for w in confident)

    if kept_scores:
        mean_score = sum(kept_scores) / len(kept_scores)
    else:
        mean_score = 0.0
    return "\n".join(kept_lines), mean_score
def extract_pdf(pdf_path: str | Path) -> PDFExtractionResult:
    """
    Extract the text of a PDF, choosing native extraction or OCR per page.

    A page whose embedded text (stripped) holds at least
    NATIVE_TEXT_MIN_CHARS characters is kept as-is. Any other page is
    rendered and passed through docTR. Pages for which OCR yields nothing
    are still recorded (empty text, confidence 0.0) and counted as OCR
    pages, so ``pages`` has one entry per page of the document.

    Args:
        pdf_path: Location of the PDF file.

    Returns:
        A PDFExtractionResult with one PageResult per page (page numbers
        start at 1) plus the native and OCR page counts.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF non trouvé : {pdf_path}")

    pages = []
    native_count = 0
    ocr_count = 0

    # The context manager closes the document even if rendering or OCR
    # raises part-way through, so the file handle is never leaked.
    with fitz.open(str(pdf_path)) as doc:
        total_pages = len(doc)
        logger.info("Extraction de %s (%d pages)", pdf_path.name, total_pages)

        for page_num, page in enumerate(doc, start=1):
            native_text = page.get_text().strip()

            if len(native_text) >= NATIVE_TEXT_MIN_CHARS:
                # Enough embedded text: treat as a native page.
                pages.append(PageResult(
                    page_num=page_num,
                    text=native_text,
                    method="native",
                    confidence=1.0,
                ))
                native_count += 1
                logger.debug(
                    " Page %d/%d : natif (%d chars)",
                    page_num, total_pages, len(native_text),
                )
                continue

            # Too little embedded text: treat as a scanned page and OCR it.
            text, confidence = _extract_text_doctr(_page_to_image(page))
            if not text.strip():
                # Blank page (cover sheet, separator, ...): normalise the
                # result so empty pages always carry "" and 0.0.
                text, confidence = "", 0.0

            pages.append(PageResult(
                page_num=page_num,
                text=text,
                method="ocr",
                confidence=confidence,
            ))
            ocr_count += 1

            if text:
                logger.debug(
                    " Page %d/%d : OCR (conf=%.2f, %d chars)",
                    page_num, total_pages, confidence, len(text),
                )
            else:
                logger.debug(" Page %d/%d : vide", page_num, total_pages)

    result = PDFExtractionResult(
        file_path=str(pdf_path),
        total_pages=total_pages,
        pages=pages,
        native_pages=native_count,
        ocr_pages=ocr_count,
    )
    logger.info(
        "Extraction terminée : %d pages natives, %d pages OCR, %d chars total",
        native_count, ocr_count, len(result.full_text),
    )
    return result