feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
169
extractor/pdf_reader.py
Normal file
169
extractor/pdf_reader.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
Extraction de texte depuis des PDF natifs et scannés.
|
||||
- PDF natif → pymupdf (extraction directe)
|
||||
- PDF scanné → docTR (OCR)
|
||||
"""
|
||||
import fitz # pymupdf
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
import io
|
||||
import logging
|
||||
|
||||
from config import NATIVE_TEXT_MIN_CHARS, OCR_DPI, DOCTR_DET_ARCH, DOCTR_RECO_ARCH, OCR_MIN_CONFIDENCE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy loading de docTR (lourd à importer)
|
||||
_doctr_predictor = None
|
||||
|
||||
|
||||
def _get_doctr_predictor():
    """Return the shared docTR predictor, loading it on first use.

    The model is heavy to import and instantiate, so it is cached in the
    module-level ``_doctr_predictor`` and built at most once per process.
    """
    global _doctr_predictor
    # Fast path: already loaded.
    if _doctr_predictor is not None:
        return _doctr_predictor

    logger.info("Chargement du modèle docTR...")
    # Imported lazily here because doctr is slow to import.
    from doctr.models import ocr_predictor

    _doctr_predictor = ocr_predictor(
        det_arch=DOCTR_DET_ARCH,
        reco_arch=DOCTR_RECO_ARCH,
        pretrained=True,
        assume_straight_pages=True,
    )
    logger.info("Modèle docTR chargé.")
    return _doctr_predictor
|
||||
|
||||
|
||||
@dataclass
class PageResult:
    """Extraction result for a single PDF page."""
    page_num: int  # 1-based page number within the document
    text: str  # extracted text (empty string for blank pages)
    method: str  # "native" (embedded text layer) or "ocr" (docTR)
    confidence: float  # 1.0 for native pages, mean docTR word score for OCR
|
||||
|
||||
|
||||
@dataclass
class PDFExtractionResult:
    """Complete extraction result for one PDF document."""
    file_path: str  # path of the source PDF
    total_pages: int  # page count of the document
    pages: list  # list[PageResult], one entry per page
    native_pages: int  # pages read from the embedded text layer
    ocr_pages: int  # pages that went through OCR (incl. blank ones)

    @property
    def full_text(self) -> str:
        """Full document text: non-blank pages joined by blank lines."""
        non_blank = [p.text for p in self.pages if p.text.strip()]
        return "\n\n".join(non_blank)
|
||||
|
||||
|
||||
def _page_to_image(page: fitz.Page, dpi: int = OCR_DPI) -> np.ndarray:
    """Render a PDF page as an RGB numpy array suitable for docTR."""
    # PDF user space is 72 dpi; scale the render matrix to the target dpi.
    scale = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    png_buffer = io.BytesIO(pix.tobytes("png"))
    rgb_image = Image.open(png_buffer).convert("RGB")
    return np.array(rgb_image)
|
||||
|
||||
|
||||
def _extract_text_doctr(image_array: np.ndarray) -> tuple[str, float]:
    """Run docTR OCR on a single image.

    Words below OCR_MIN_CONFIDENCE are discarded. Returns
    ``(text, mean_word_confidence)``; the confidence is 0.0 when no word
    passes the threshold.
    """
    ocr_result = _get_doctr_predictor()([image_array])

    kept_lines: list = []
    kept_scores: list = []

    for ocr_page in ocr_result.pages:
        for block in ocr_page.blocks:
            for line in block.lines:
                accepted = [w for w in line.words if w.confidence >= OCR_MIN_CONFIDENCE]
                if not accepted:
                    continue
                kept_lines.append(" ".join(w.value for w in accepted))
                kept_scores.extend(w.confidence for w in accepted)

    mean_conf = sum(kept_scores) / len(kept_scores) if kept_scores else 0.0
    return "\n".join(kept_lines), mean_conf
|
||||
|
||||
|
||||
def extract_pdf(pdf_path: str | Path) -> PDFExtractionResult:
    """
    Extract text from a PDF, deciding page by page between the native
    text layer and docTR OCR.

    A page is treated as native when its embedded text layer holds at
    least NATIVE_TEXT_MIN_CHARS characters; otherwise it is rendered and
    OCRed. Pages that yield no text (cover pages, separators) are kept
    with an empty ``text`` so page numbering stays contiguous.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        PDFExtractionResult with one PageResult per page.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF non trouvé : {pdf_path}")

    doc = fitz.open(str(pdf_path))
    pages = []
    native_count = 0
    ocr_count = 0

    # try/finally guarantees the document handle is released even if
    # rendering or OCR raises mid-way (previously it leaked on error).
    try:
        total_pages = len(doc)
        logger.info("Extraction de %s (%d pages)", pdf_path.name, total_pages)

        for page_num in range(total_pages):
            page = doc[page_num]
            native_text = page.get_text().strip()

            if len(native_text) >= NATIVE_TEXT_MIN_CHARS:
                # Native page: trust the embedded text layer as-is.
                pages.append(PageResult(
                    page_num=page_num + 1,
                    text=native_text,
                    method="native",
                    confidence=1.0,
                ))
                native_count += 1
                logger.debug(
                    "  Page %d/%d : natif (%d chars)",
                    page_num + 1, total_pages, len(native_text),
                )
            else:
                # Scanned page: render to an image and OCR with docTR.
                image = _page_to_image(page)
                text, confidence = _extract_text_doctr(image)

                if text.strip():
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text=text,
                        method="ocr",
                        confidence=confidence,
                    ))
                    ocr_count += 1
                    logger.debug(
                        "  Page %d/%d : OCR (conf=%.2f, %d chars)",
                        page_num + 1, total_pages, confidence, len(text),
                    )
                else:
                    # Blank page (cover sheet, separator, ...): keep a
                    # placeholder entry so page numbers stay contiguous.
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text="",
                        method="ocr",
                        confidence=0.0,
                    ))
                    ocr_count += 1
                    logger.debug("  Page %d/%d : vide", page_num + 1, total_pages)
    finally:
        doc.close()

    result = PDFExtractionResult(
        file_path=str(pdf_path),
        total_pages=total_pages,
        pages=pages,
        native_pages=native_count,
        ocr_pages=ocr_count,
    )

    logger.info(
        "Extraction terminée : %d pages natives, %d pages OCR, %d chars total",
        native_count, ocr_count, len(result.full_text),
    )

    return result
|
||||
Reference in New Issue
Block a user