Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
206 lines
7.8 KiB
Python
206 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
T2A Extractor — Extraction structurée de rapports UCR
|
|
Usage : python main.py <fichier.pdf> [--output-dir <dossier>] [--csv] [--verbose]
|
|
"""
|
|
import argparse
|
|
import sys
|
|
import time
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Ajouter le répertoire du projet au path
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from config import DEFAULT_OUTPUT_DIR, OLLAMA_MODEL
|
|
from extractor.pdf_reader import extract_pdf
|
|
from extractor.segmenter import segment_text
|
|
from extractor.llm_extractor import extract_ogc_block, extract_champ_block, check_ollama_available
|
|
from extractor.normalizer import normalize_all
|
|
from extractor.validator import validate_all
|
|
from extractor.exporter import export_excel, export_csv
|
|
|
|
|
|
def setup_logging(verbose: bool = False) -> None:
    """Configure the root logger for console output.

    Installs a single ``StreamHandler`` with a short ``HH:MM:SS`` timestamp
    format on the root logger, at DEBUG level when *verbose* is true and
    INFO otherwise.

    Any handler previously attached to the root logger is removed first, so
    calling this function more than once (e.g. from tests or a REPL) does
    not duplicate every log line — the original implementation appended a
    new handler on each call.

    Args:
        verbose: When True, log at DEBUG level instead of INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    formatter = logging.Formatter(
        '%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%H:%M:%S'
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    # Drop any handlers installed by an earlier call so configuration
    # stays idempotent (exactly one console handler at all times).
    for existing in list(root_logger.handlers):
        root_logger.removeHandler(existing)
    root_logger.setLevel(level)
    root_logger.addHandler(handler)
|
|
|
|
|
|
def main():
    """CLI entry point: run the full UCR extraction pipeline on one PDF.

    Stages (mirrored by the banner log output, in French):
      0.   Check that the Ollama service and model are reachable.
      1.   Extract text from the PDF (native pages + OCR pages).
      2.   Segment the text into OGC blocks and champ-level blocks.
      3.   Structured extraction of each block via the LLM.
      3.5  Deterministic normalization (CIM-10 codes, retained codes, text).
      4.   Validation of extractions (skipped with ``--skip-validation``).
      5.   Export to Excel, and optionally CSV (``--csv``).

    Exits with status 1 when the input PDF does not exist or Ollama is
    unavailable.
    """
    parser = argparse.ArgumentParser(
        description="Extraction structurée de rapports de contrôle T2A (UCR)"
    )
    parser.add_argument("pdf", help="Chemin vers le fichier PDF à traiter")
    parser.add_argument("--output-dir", "-o", default=None,
                        help="Dossier de sortie (défaut: ./output)")
    parser.add_argument("--csv", action="store_true",
                        help="Exporter aussi en CSV")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Mode verbeux (debug)")
    parser.add_argument("--skip-validation", action="store_true",
                        help="Ne pas valider les extractions")

    args = parser.parse_args()
    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    # Wall-clock start of the whole run, so the final "Durée totale" covers
    # every stage including the Ollama check (t0 below only times step 1).
    start_time = time.time()

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        logger.error(f"Fichier non trouvé : {pdf_path}")
        sys.exit(1)

    output_dir = Path(args.output_dir) if args.output_dir else DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    # Output files share the input PDF's stem.
    stem = pdf_path.stem
    excel_path = output_dir / f"{stem}_ucr_extract.xlsx"
    csv_path = output_dir / f"{stem}_ucr_extract.csv"

    # ============================================================
    # Step 0: Ollama availability check
    # ============================================================
    logger.info(f"Vérification Ollama ({OLLAMA_MODEL})...")
    if not check_ollama_available():
        logger.error("Ollama non disponible. Assurez-vous que le service est démarré et le modèle chargé.")
        # Literal shell commands for the operator to run.
        logger.error(" → ollama serve")
        logger.error(f" → ollama pull {OLLAMA_MODEL}")
        sys.exit(1)

    # ============================================================
    # Step 1: PDF text extraction
    # ============================================================
    logger.info("=" * 60)
    logger.info("ÉTAPE 1 : Extraction du texte PDF")
    logger.info("=" * 60)
    t0 = time.time()

    pdf_result = extract_pdf(pdf_path)
    logger.info(f" {pdf_result.total_pages} pages ({pdf_result.native_pages} natives, {pdf_result.ocr_pages} OCR)")
    logger.info(f" {len(pdf_result.full_text)} caractères extraits en {time.time() - t0:.1f}s")

    # ============================================================
    # Step 2: Segmentation into OGC blocks
    # ============================================================
    logger.info("=" * 60)
    logger.info("ÉTAPE 2 : Segmentation en blocs OGC")
    logger.info("=" * 60)
    t1 = time.time()

    segments = segment_text(pdf_result.full_text)
    logger.info(f" {segments.total_ogc_count} OGC détectés en {len(segments.ogc_blocks)} blocs")
    logger.info(f" {len(segments.champ_blocks)} décisions au niveau champ")
    logger.info(f" Segmentation en {time.time() - t1:.1f}s")

    # ============================================================
    # Step 3: Structured extraction via the VLM
    # ============================================================
    logger.info("=" * 60)
    logger.info("ÉTAPE 3 : Extraction structurée via VLM")
    logger.info("=" * 60)
    t2 = time.time()

    all_extractions = []
    total_blocks = len(segments.ogc_blocks) + len(segments.champ_blocks)
    current = 0  # progress counter shared by both loops below

    # Extract the OGC blocks (each may yield several extractions).
    for block in segments.ogc_blocks:
        current += 1
        ogc_str = ",".join(str(n) for n in block.ogc_numbers)
        logger.info(f" [{current}/{total_blocks}] Champ {block.champ} — OGC {ogc_str}...")

        extractions = extract_ogc_block(
            champ=block.champ,
            ogc_numbers=block.ogc_numbers,
            block_text=block.text,
        )
        all_extractions.extend(extractions)

        for ext in extractions:
            status = "✓" if ext.extraction_success else "✗"
            logger.info(f" {status} OGC {ext.num_ogc} → {ext.decision_ucr or 'N/A'}")

    # Extract the champ-level blocks (one global decision each).
    for block in segments.champ_blocks:
        current += 1
        logger.info(f" [{current}/{total_blocks}] Champ {block.champ} (décision globale)...")

        extraction = extract_champ_block(
            champ=block.champ,
            block_text=block.text,
        )
        all_extractions.append(extraction)

    # Final sort: by champ then OGC number; None sorts as 0 (champ-level
    # rows, which have no num_ogc, come first within their champ).
    all_extractions.sort(key=lambda x: (x.champ or 0, x.num_ogc or 0))

    elapsed = time.time() - t2
    logger.info(f" {len(all_extractions)} extractions en {elapsed:.1f}s ({elapsed/max(len(all_extractions),1):.1f}s/extraction)")

    # ============================================================
    # Step 3.5: Normalization (deterministic post-processing)
    # ============================================================
    logger.info("=" * 60)
    logger.info("ÉTAPE 3.5 : Normalisation (codes CIM-10, codes retenus, texte)")
    logger.info("=" * 60)
    t_norm = time.time()

    # Mutates the extractions in place and reports how many fixes it made.
    norm_report = normalize_all(all_extractions)
    logger.info(f" {norm_report['total_fixes']} corrections en {time.time() - t_norm:.1f}s")

    # ============================================================
    # Step 4: Validation
    # ============================================================
    if not args.skip_validation:
        logger.info("=" * 60)
        logger.info("ÉTAPE 4 : Validation")
        logger.info("=" * 60)

        report = validate_all(all_extractions)
        logger.info(f" {report['valid']}/{report['total']} valides, "
                    f"{report['with_warnings']} avec warnings, "
                    f"{report['failed']} échoués")
        if report.get('total_fixes'):
            logger.info(f" {report['total_fixes']} auto-corrections supplémentaires (safety-net)")

    # ============================================================
    # Step 5: Export
    # ============================================================
    logger.info("=" * 60)
    logger.info("ÉTAPE 5 : Export")
    logger.info("=" * 60)

    n = export_excel(all_extractions, excel_path)
    logger.info(f" Excel : {excel_path} ({n} lignes)")

    if args.csv:
        n = export_csv(all_extractions, csv_path)
        logger.info(f" CSV : {csv_path} ({n} lignes)")

    # ============================================================
    # Summary
    # ============================================================
    total_time = time.time() - start_time
    logger.info("=" * 60)
    logger.info("TERMINÉ")
    logger.info(f" Durée totale : {total_time:.1f}s")
    logger.info(f" OGC extraits : {len(all_extractions)}")
    success_count = sum(1 for e in all_extractions if e.extraction_success)
    logger.info(f" Succès : {success_count}/{len(all_extractions)}")
    logger.info(f" Sortie : {excel_path}")
    logger.info("=" * 60)
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly
# (e.g. `python main.py report.pdf`), not when the module is imported.
if __name__ == "__main__":
    main()
|