feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with the full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit confusion in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill of codes_retenus from the decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
44
config.py
Normal file
44
config.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
T2A Extractor configuration
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
# === Ollama ===
|
||||
OLLAMA_BASE_URL = "http://localhost:11434"
|
||||
OLLAMA_MODEL = "gemma3:27b-cloud" # Adjust to match the exact model tag
|
||||
OLLAMA_TIMEOUT = 120 # seconds per request
|
||||
OLLAMA_MAX_RETRIES = 2
|
||||
|
||||
# === OCR (docTR) ===
|
||||
DOCTR_DET_ARCH = "db_resnet50"
|
||||
DOCTR_RECO_ARCH = "crnn_vgg16_bn"
|
||||
OCR_DPI = 200 # resolution used for the page → image conversion
|
||||
OCR_MIN_CONFIDENCE = 0.5 # minimum docTR confidence threshold
|
||||
|
||||
# === PDF extraction ===
|
||||
# Character-count threshold for treating a page as "native"
|
||||
# (some scanned pages contain a few stray characters)
|
||||
NATIVE_TEXT_MIN_CHARS = 50
|
||||
|
||||
# === Output schema ===
|
||||
OUTPUT_COLUMNS = [
|
||||
"champ",
|
||||
"num_ogc",
|
||||
"type_desaccord",
|
||||
"codes_etablissement",
|
||||
"libelle_etablissement",
|
||||
"codes_controleurs",
|
||||
"libelle_controleurs",
|
||||
"decision_ucr",
|
||||
"codes_retenus",
|
||||
"ghm_ghs",
|
||||
"texte_decision",
|
||||
]
|
||||
|
||||
# Allowed values for the enum-like fields
|
||||
DECISION_VALUES = ["Favorable", "Défavorable"]
|
||||
TYPE_DESACCORD_VALUES = ["DP", "DAS", "DP+DAS", "Actes"]
|
||||
|
||||
# === Paths ===
|
||||
PROJECT_ROOT = Path(__file__).parent  # directory containing this config module
|
||||
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "output"  # "output" subdirectory of PROJECT_ROOT
|
||||
Reference in New Issue
Block a user