feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
205
main.py
Normal file
205
main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
"""
T2A Extractor — structured extraction of UCR control reports.

Usage : python main.py <fichier.pdf> [--output-dir <dossier>] [--csv] [--verbose]
"""
import argparse
import logging
import sys
import time
from pathlib import Path

# Make the project root importable regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

from config import DEFAULT_OUTPUT_DIR, OLLAMA_MODEL
from extractor.pdf_reader import extract_pdf
from extractor.segmenter import segment_text
from extractor.llm_extractor import (
    check_ollama_available,
    extract_champ_block,
    extract_ogc_block,
)
from extractor.normalizer import normalize_all
from extractor.validator import validate_all
from extractor.exporter import export_excel, export_csv
||||
def setup_logging(verbose: bool = False) -> None:
    """Configure the root logger with a timestamped stream handler.

    Args:
        verbose: when True, log at DEBUG level; otherwise INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    formatter = logging.Formatter(
        '%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%H:%M:%S'
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    # Fix: the original added a handler unconditionally, so calling
    # setup_logging() more than once duplicated every log line.
    # Install our stream handler exactly once per process.
    if not getattr(setup_logging, "_handler_installed", False):
        root_logger.addHandler(handler)
        setup_logging._handler_installed = True
||||
def _log_banner(logger: logging.Logger, title: str) -> None:
    """Log a stage title framed by '=' rules (visual pipeline separator)."""
    logger.info("=" * 60)
    logger.info(title)
    logger.info("=" * 60)


def main():
    """Run the full UCR extraction pipeline on a single PDF.

    Stages: Ollama availability check → PDF text extraction → OGC
    segmentation → LLM structured extraction → deterministic
    normalization → (optional) validation → Excel/CSV export.
    Exits with status 1 when the input file is missing or the Ollama
    backend is unreachable.
    """
    parser = argparse.ArgumentParser(
        description="Extraction structurée de rapports de contrôle T2A (UCR)"
    )
    parser.add_argument("pdf", help="Chemin vers le fichier PDF à traiter")
    parser.add_argument("--output-dir", "-o", default=None,
                        help="Dossier de sortie (défaut: ./output)")
    parser.add_argument("--csv", action="store_true",
                        help="Exporter aussi en CSV")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Mode verbeux (debug)")
    parser.add_argument("--skip-validation", action="store_true",
                        help="Ne pas valider les extractions")

    args = parser.parse_args()
    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        logger.error(f"Fichier non trouvé : {pdf_path}")
        sys.exit(1)

    output_dir = Path(args.output_dir) if args.output_dir else DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = pdf_path.stem
    excel_path = output_dir / f"{stem}_ucr_extract.xlsx"
    csv_path = output_dir / f"{stem}_ucr_extract.csv"

    # Step 0: fail fast if the LLM backend is not reachable.
    logger.info(f"Vérification Ollama ({OLLAMA_MODEL})...")
    if not check_ollama_available():
        logger.error("Ollama non disponible. Assurez-vous que le service est démarré et le modèle chargé.")
        # Fix: this line was an f-string with no placeholder (lint F541).
        logger.error("  → ollama serve")
        logger.error(f"  → ollama pull {OLLAMA_MODEL}")
        sys.exit(1)

    # Step 1: PDF text extraction (native text pages + OCR pages).
    _log_banner(logger, "ÉTAPE 1 : Extraction du texte PDF")
    t0 = time.time()

    pdf_result = extract_pdf(pdf_path)
    logger.info(f"  {pdf_result.total_pages} pages ({pdf_result.native_pages} natives, {pdf_result.ocr_pages} OCR)")
    logger.info(f"  {len(pdf_result.full_text)} caractères extraits en {time.time() - t0:.1f}s")

    # Step 2: split the raw text into OGC blocks and champ-level blocks.
    _log_banner(logger, "ÉTAPE 2 : Segmentation en blocs OGC")
    t1 = time.time()

    segments = segment_text(pdf_result.full_text)
    logger.info(f"  {segments.total_ogc_count} OGC détectés en {len(segments.ogc_blocks)} blocs")
    logger.info(f"  {len(segments.champ_blocks)} décisions au niveau champ")
    logger.info(f"  Segmentation en {time.time() - t1:.1f}s")

    # Step 3: structured extraction through the LLM.
    _log_banner(logger, "ÉTAPE 3 : Extraction structurée via VLM")
    t2 = time.time()

    all_extractions = []
    total_blocks = len(segments.ogc_blocks) + len(segments.champ_blocks)
    current = 0

    # OGC-level blocks: one LLM call may yield several extractions.
    for block in segments.ogc_blocks:
        current += 1
        ogc_str = ",".join(str(n) for n in block.ogc_numbers)
        logger.info(f"  [{current}/{total_blocks}] Champ {block.champ} — OGC {ogc_str}...")

        extractions = extract_ogc_block(
            champ=block.champ,
            ogc_numbers=block.ogc_numbers,
            block_text=block.text,
        )
        all_extractions.extend(extractions)

        for ext in extractions:
            status = "✓" if ext.extraction_success else "✗"
            logger.info(f"  {status} OGC {ext.num_ogc} → {ext.decision_ucr or 'N/A'}")

    # Champ-level blocks: a single global decision per champ.
    for block in segments.champ_blocks:
        current += 1
        logger.info(f"  [{current}/{total_blocks}] Champ {block.champ} (décision globale)...")

        extraction = extract_champ_block(
            champ=block.champ,
            block_text=block.text,
        )
        all_extractions.append(extraction)

    # Stable output order: by champ, then by OGC number (None sorts as 0).
    all_extractions.sort(key=lambda x: (x.champ or 0, x.num_ogc or 0))

    elapsed = time.time() - t2
    logger.info(f"  {len(all_extractions)} extractions en {elapsed:.1f}s ({elapsed/max(len(all_extractions),1):.1f}s/extraction)")

    # Step 3.5: deterministic post-processing (CIM-10 codes, retained codes, text).
    _log_banner(logger, "ÉTAPE 3.5 : Normalisation (codes CIM-10, codes retenus, texte)")
    t_norm = time.time()

    norm_report = normalize_all(all_extractions)
    logger.info(f"  {norm_report['total_fixes']} corrections en {time.time() - t_norm:.1f}s")

    # Step 4: optional validation pass (skippable via --skip-validation).
    if not args.skip_validation:
        _log_banner(logger, "ÉTAPE 4 : Validation")

        report = validate_all(all_extractions)
        logger.info(f"  {report['valid']}/{report['total']} valides, "
                    f"{report['with_warnings']} avec warnings, "
                    f"{report['failed']} échoués")
        if report.get('total_fixes'):
            logger.info(f"  {report['total_fixes']} auto-corrections supplémentaires (safety-net)")

    # Step 5: export to Excel (always) and CSV (on request).
    _log_banner(logger, "ÉTAPE 5 : Export")

    n = export_excel(all_extractions, excel_path)
    logger.info(f"  Excel : {excel_path} ({n} lignes)")

    if args.csv:
        n = export_csv(all_extractions, csv_path)
        logger.info(f"  CSV : {csv_path} ({n} lignes)")

    # Final summary. NOTE(review): total time is measured from Step 1 (t0),
    # so it excludes the Ollama availability check — kept as in the original.
    total_time = time.time() - t0
    logger.info("=" * 60)
    logger.info("TERMINÉ")
    logger.info(f"  Durée totale : {total_time:.1f}s")
    logger.info(f"  OGC extraits : {len(all_extractions)}")
    success_count = sum(1 for e in all_extractions if e.extraction_success)
    logger.info(f"  Succès : {success_count}/{len(all_extractions)}")
    logger.info(f"  Sortie : {excel_path}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user