feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
205
main.py
Normal file
205
main.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
"""
T2A Extractor — structured extraction of UCR control reports.

Usage : python main.py <fichier.pdf> [--output-dir <dossier>] [--csv] [--verbose]
"""
import argparse
import logging
import sys
import time
from pathlib import Path

# Make the project root importable regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

from config import DEFAULT_OUTPUT_DIR, OLLAMA_MODEL
from extractor.pdf_reader import extract_pdf
from extractor.segmenter import segment_text
from extractor.llm_extractor import (
    check_ollama_available,
    extract_champ_block,
    extract_ogc_block,
)
from extractor.normalizer import normalize_all
from extractor.validator import validate_all
from extractor.exporter import export_excel, export_csv
||||
def setup_logging(verbose: bool = False) -> None:
    """Configure the root logger with a timestamped stream handler.

    Args:
        verbose: when True, log at DEBUG level; otherwise INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    formatter = logging.Formatter(
        '%(asctime)s [%(levelname)s] %(message)s',
        datefmt='%H:%M:%S'
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    # Fix: the original added a handler unconditionally, so calling
    # setup_logging() more than once duplicated every log line.
    # Install our stream handler exactly once per process.
    if not getattr(setup_logging, "_handler_installed", False):
        root_logger.addHandler(handler)
        setup_logging._handler_installed = True
||||
def _log_banner(logger: logging.Logger, title: str) -> None:
    """Log a stage title framed by '=' rules (visual pipeline separator)."""
    logger.info("=" * 60)
    logger.info(title)
    logger.info("=" * 60)


def main():
    """Run the full UCR extraction pipeline on a single PDF.

    Stages: Ollama availability check → PDF text extraction → OGC
    segmentation → LLM structured extraction → deterministic
    normalization → (optional) validation → Excel/CSV export.
    Exits with status 1 when the input file is missing or the Ollama
    backend is unreachable.
    """
    parser = argparse.ArgumentParser(
        description="Extraction structurée de rapports de contrôle T2A (UCR)"
    )
    parser.add_argument("pdf", help="Chemin vers le fichier PDF à traiter")
    parser.add_argument("--output-dir", "-o", default=None,
                        help="Dossier de sortie (défaut: ./output)")
    parser.add_argument("--csv", action="store_true",
                        help="Exporter aussi en CSV")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Mode verbeux (debug)")
    parser.add_argument("--skip-validation", action="store_true",
                        help="Ne pas valider les extractions")

    args = parser.parse_args()
    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    pdf_path = Path(args.pdf)
    if not pdf_path.exists():
        logger.error(f"Fichier non trouvé : {pdf_path}")
        sys.exit(1)

    output_dir = Path(args.output_dir) if args.output_dir else DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = pdf_path.stem
    excel_path = output_dir / f"{stem}_ucr_extract.xlsx"
    csv_path = output_dir / f"{stem}_ucr_extract.csv"

    # Step 0: fail fast if the LLM backend is not reachable.
    logger.info(f"Vérification Ollama ({OLLAMA_MODEL})...")
    if not check_ollama_available():
        logger.error("Ollama non disponible. Assurez-vous que le service est démarré et le modèle chargé.")
        # Fix: this line was an f-string with no placeholder (lint F541).
        logger.error("  → ollama serve")
        logger.error(f"  → ollama pull {OLLAMA_MODEL}")
        sys.exit(1)

    # Step 1: PDF text extraction (native text pages + OCR pages).
    _log_banner(logger, "ÉTAPE 1 : Extraction du texte PDF")
    t0 = time.time()

    pdf_result = extract_pdf(pdf_path)
    logger.info(f"  {pdf_result.total_pages} pages ({pdf_result.native_pages} natives, {pdf_result.ocr_pages} OCR)")
    logger.info(f"  {len(pdf_result.full_text)} caractères extraits en {time.time() - t0:.1f}s")

    # Step 2: split the raw text into OGC blocks and champ-level blocks.
    _log_banner(logger, "ÉTAPE 2 : Segmentation en blocs OGC")
    t1 = time.time()

    segments = segment_text(pdf_result.full_text)
    logger.info(f"  {segments.total_ogc_count} OGC détectés en {len(segments.ogc_blocks)} blocs")
    logger.info(f"  {len(segments.champ_blocks)} décisions au niveau champ")
    logger.info(f"  Segmentation en {time.time() - t1:.1f}s")

    # Step 3: structured extraction through the LLM.
    _log_banner(logger, "ÉTAPE 3 : Extraction structurée via VLM")
    t2 = time.time()

    all_extractions = []
    total_blocks = len(segments.ogc_blocks) + len(segments.champ_blocks)
    current = 0

    # OGC-level blocks: one LLM call may yield several extractions.
    for block in segments.ogc_blocks:
        current += 1
        ogc_str = ",".join(str(n) for n in block.ogc_numbers)
        logger.info(f"  [{current}/{total_blocks}] Champ {block.champ} — OGC {ogc_str}...")

        extractions = extract_ogc_block(
            champ=block.champ,
            ogc_numbers=block.ogc_numbers,
            block_text=block.text,
        )
        all_extractions.extend(extractions)

        for ext in extractions:
            status = "✓" if ext.extraction_success else "✗"
            logger.info(f"  {status} OGC {ext.num_ogc} → {ext.decision_ucr or 'N/A'}")

    # Champ-level blocks: a single global decision per champ.
    for block in segments.champ_blocks:
        current += 1
        logger.info(f"  [{current}/{total_blocks}] Champ {block.champ} (décision globale)...")

        extraction = extract_champ_block(
            champ=block.champ,
            block_text=block.text,
        )
        all_extractions.append(extraction)

    # Stable output order: by champ, then by OGC number (None sorts as 0).
    all_extractions.sort(key=lambda x: (x.champ or 0, x.num_ogc or 0))

    elapsed = time.time() - t2
    logger.info(f"  {len(all_extractions)} extractions en {elapsed:.1f}s ({elapsed/max(len(all_extractions),1):.1f}s/extraction)")

    # Step 3.5: deterministic post-processing (CIM-10 codes, retained codes, text).
    _log_banner(logger, "ÉTAPE 3.5 : Normalisation (codes CIM-10, codes retenus, texte)")
    t_norm = time.time()

    norm_report = normalize_all(all_extractions)
    logger.info(f"  {norm_report['total_fixes']} corrections en {time.time() - t_norm:.1f}s")

    # Step 4: optional validation pass (skippable via --skip-validation).
    if not args.skip_validation:
        _log_banner(logger, "ÉTAPE 4 : Validation")

        report = validate_all(all_extractions)
        logger.info(f"  {report['valid']}/{report['total']} valides, "
                    f"{report['with_warnings']} avec warnings, "
                    f"{report['failed']} échoués")
        if report.get('total_fixes'):
            logger.info(f"  {report['total_fixes']} auto-corrections supplémentaires (safety-net)")

    # Step 5: export to Excel (always) and CSV (on request).
    _log_banner(logger, "ÉTAPE 5 : Export")

    n = export_excel(all_extractions, excel_path)
    logger.info(f"  Excel : {excel_path} ({n} lignes)")

    if args.csv:
        n = export_csv(all_extractions, csv_path)
        logger.info(f"  CSV : {csv_path} ({n} lignes)")

    # Final summary. NOTE(review): total time is measured from Step 1 (t0),
    # so it excludes the Ollama availability check — kept as in the original.
    total_time = time.time() - t0
    logger.info("=" * 60)
    logger.info("TERMINÉ")
    logger.info(f"  Durée totale : {total_time:.1f}s")
    logger.info(f"  OGC extraits : {len(all_extractions)}")
    success_count = sum(1 for e in all_extractions if e.extraction_success)
    logger.info(f"  Succès : {success_count}/{len(all_extractions)}")
    logger.info(f"  Sortie : {excel_path}")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user