feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
169
extractor/pdf_reader.py
Normal file
169
extractor/pdf_reader.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
Extraction de texte depuis des PDF natifs et scannés.
|
||||
- PDF natif → pymupdf (extraction directe)
|
||||
- PDF scanné → docTR (OCR)
|
||||
"""
|
||||
import fitz # pymupdf
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
import io
|
||||
import logging
|
||||
|
||||
from config import NATIVE_TEXT_MIN_CHARS, OCR_DPI, DOCTR_DET_ARCH, DOCTR_RECO_ARCH, OCR_MIN_CONFIDENCE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Lazy loading de docTR (lourd à importer)
|
||||
_doctr_predictor = None
|
||||
|
||||
|
||||
def _get_doctr_predictor():
    """Return the shared docTR predictor, loading it on first use.

    The model is heavy to import and instantiate, so it is cached in the
    module-level ``_doctr_predictor`` and built at most once per process.
    """
    global _doctr_predictor
    # Fast path: already loaded.
    if _doctr_predictor is not None:
        return _doctr_predictor

    logger.info("Chargement du modèle docTR...")
    # Imported lazily here because doctr is slow to import.
    from doctr.models import ocr_predictor

    _doctr_predictor = ocr_predictor(
        det_arch=DOCTR_DET_ARCH,
        reco_arch=DOCTR_RECO_ARCH,
        pretrained=True,
        assume_straight_pages=True,
    )
    logger.info("Modèle docTR chargé.")
    return _doctr_predictor
|
||||
|
||||
|
||||
@dataclass
class PageResult:
    """Extraction result for a single PDF page."""
    page_num: int  # 1-based page number within the document
    text: str  # extracted text (empty string for blank pages)
    method: str  # "native" (embedded text layer) or "ocr" (docTR)
    confidence: float  # 1.0 for native pages, mean docTR word score for OCR
|
||||
|
||||
|
||||
@dataclass
class PDFExtractionResult:
    """Complete extraction result for one PDF document."""
    file_path: str  # path of the source PDF
    total_pages: int  # page count of the document
    pages: list  # list[PageResult], one entry per page
    native_pages: int  # pages read from the embedded text layer
    ocr_pages: int  # pages that went through OCR (incl. blank ones)

    @property
    def full_text(self) -> str:
        """Full document text: non-blank pages joined by blank lines."""
        non_blank = [p.text for p in self.pages if p.text.strip()]
        return "\n\n".join(non_blank)
|
||||
|
||||
|
||||
def _page_to_image(page: fitz.Page, dpi: int = OCR_DPI) -> np.ndarray:
    """Render a PDF page as an RGB numpy array suitable for docTR."""
    # PDF user space is 72 dpi; scale the render matrix to the target dpi.
    scale = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    png_buffer = io.BytesIO(pix.tobytes("png"))
    rgb_image = Image.open(png_buffer).convert("RGB")
    return np.array(rgb_image)
|
||||
|
||||
|
||||
def _extract_text_doctr(image_array: np.ndarray) -> tuple[str, float]:
    """Run docTR OCR on a single image.

    Words below OCR_MIN_CONFIDENCE are discarded. Returns
    ``(text, mean_word_confidence)``; the confidence is 0.0 when no word
    passes the threshold.
    """
    ocr_result = _get_doctr_predictor()([image_array])

    kept_lines: list = []
    kept_scores: list = []

    for ocr_page in ocr_result.pages:
        for block in ocr_page.blocks:
            for line in block.lines:
                accepted = [w for w in line.words if w.confidence >= OCR_MIN_CONFIDENCE]
                if not accepted:
                    continue
                kept_lines.append(" ".join(w.value for w in accepted))
                kept_scores.extend(w.confidence for w in accepted)

    mean_conf = sum(kept_scores) / len(kept_scores) if kept_scores else 0.0
    return "\n".join(kept_lines), mean_conf
|
||||
|
||||
|
||||
def extract_pdf(pdf_path: str | Path) -> PDFExtractionResult:
    """
    Extract text from a PDF, deciding page by page between the native
    text layer and docTR OCR.

    A page is treated as native when its embedded text layer holds at
    least NATIVE_TEXT_MIN_CHARS characters; otherwise it is rendered and
    OCRed. Pages that yield no text (cover pages, separators) are kept
    with an empty ``text`` so page numbering stays contiguous.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        PDFExtractionResult with one PageResult per page.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF non trouvé : {pdf_path}")

    doc = fitz.open(str(pdf_path))
    pages = []
    native_count = 0
    ocr_count = 0

    # try/finally guarantees the document handle is released even if
    # rendering or OCR raises mid-way (previously it leaked on error).
    try:
        total_pages = len(doc)
        logger.info("Extraction de %s (%d pages)", pdf_path.name, total_pages)

        for page_num in range(total_pages):
            page = doc[page_num]
            native_text = page.get_text().strip()

            if len(native_text) >= NATIVE_TEXT_MIN_CHARS:
                # Native page: trust the embedded text layer as-is.
                pages.append(PageResult(
                    page_num=page_num + 1,
                    text=native_text,
                    method="native",
                    confidence=1.0,
                ))
                native_count += 1
                logger.debug(
                    "  Page %d/%d : natif (%d chars)",
                    page_num + 1, total_pages, len(native_text),
                )
            else:
                # Scanned page: render to an image and OCR with docTR.
                image = _page_to_image(page)
                text, confidence = _extract_text_doctr(image)

                if text.strip():
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text=text,
                        method="ocr",
                        confidence=confidence,
                    ))
                    ocr_count += 1
                    logger.debug(
                        "  Page %d/%d : OCR (conf=%.2f, %d chars)",
                        page_num + 1, total_pages, confidence, len(text),
                    )
                else:
                    # Blank page (cover sheet, separator, ...): keep a
                    # placeholder entry so page numbers stay contiguous.
                    pages.append(PageResult(
                        page_num=page_num + 1,
                        text="",
                        method="ocr",
                        confidence=0.0,
                    ))
                    ocr_count += 1
                    logger.debug("  Page %d/%d : vide", page_num + 1, total_pages)
    finally:
        doc.close()

    result = PDFExtractionResult(
        file_path=str(pdf_path),
        total_pages=total_pages,
        pages=pages,
        native_pages=native_count,
        ocr_pages=ocr_count,
    )

    logger.info(
        "Extraction terminée : %d pages natives, %d pages OCR, %d chars total",
        native_count, ocr_count, len(result.full_text),
    )

    return result
|
||||
Reference in New Issue
Block a user