feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
157
extractor/exporter.py
Normal file
157
extractor/exporter.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
Export des données extraites en Excel et CSV.
|
||||
"""
|
||||
import csv
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
|
||||
from config import OUTPUT_COLUMNS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Styles Excel
|
||||
HEADER_FONT = Font(bold=True, color="FFFFFF", size=11, name="Arial")
|
||||
HEADER_FILL = PatternFill("solid", fgColor="2F5496")
|
||||
HEADER_ALIGN = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
CELL_ALIGN = Alignment(vertical="top", wrap_text=True)
|
||||
CELL_FONT = Font(name="Arial", size=10)
|
||||
THIN_BORDER = Border(
|
||||
left=Side(style='thin'),
|
||||
right=Side(style='thin'),
|
||||
top=Side(style='thin'),
|
||||
bottom=Side(style='thin'),
|
||||
)
|
||||
|
||||
# Couleurs de décision
|
||||
FILL_FAVORABLE = PatternFill("solid", fgColor="CCFFCC") # Vert clair
|
||||
FILL_DEFAVORABLE = PatternFill("solid", fgColor="FFCCCC") # Rouge clair
|
||||
FILL_UNKNOWN = PatternFill("solid", fgColor="FFFFCC") # Jaune clair
|
||||
FILL_ERROR = PatternFill("solid", fgColor="FFD9CC") # Orange clair
|
||||
|
||||
# Largeurs de colonnes
|
||||
COLUMN_WIDTHS = {
|
||||
"champ": 8,
|
||||
"num_ogc": 10,
|
||||
"type_desaccord": 14,
|
||||
"codes_etablissement": 22,
|
||||
"libelle_etablissement": 40,
|
||||
"codes_controleurs": 22,
|
||||
"libelle_controleurs": 40,
|
||||
"decision_ucr": 16,
|
||||
"codes_retenus": 22,
|
||||
"ghm_ghs": 22,
|
||||
"texte_decision": 80,
|
||||
}
|
||||
|
||||
# Labels d'en-tête plus lisibles
|
||||
HEADER_LABELS = {
|
||||
"champ": "Champ",
|
||||
"num_ogc": "N° OGC",
|
||||
"type_desaccord": "Type désaccord",
|
||||
"codes_etablissement": "Codes Établissement",
|
||||
"libelle_etablissement": "Libellé Établissement",
|
||||
"codes_controleurs": "Codes Contrôleurs",
|
||||
"libelle_controleurs": "Libellé Contrôleurs",
|
||||
"decision_ucr": "Décision UCR",
|
||||
"codes_retenus": "Codes retenus",
|
||||
"ghm_ghs": "GHM / GHS",
|
||||
"texte_decision": "Texte décision",
|
||||
}
|
||||
|
||||
|
||||
def _extraction_to_row(extraction) -> dict:
|
||||
"""Convertit une extraction en dictionnaire pour l'export."""
|
||||
return {
|
||||
"champ": extraction.champ,
|
||||
"num_ogc": extraction.num_ogc,
|
||||
"type_desaccord": extraction.type_desaccord,
|
||||
"codes_etablissement": extraction.codes_etablissement,
|
||||
"libelle_etablissement": extraction.libelle_etablissement,
|
||||
"codes_controleurs": extraction.codes_controleurs,
|
||||
"libelle_controleurs": extraction.libelle_controleurs,
|
||||
"decision_ucr": extraction.decision_ucr,
|
||||
"codes_retenus": extraction.codes_retenus,
|
||||
"ghm_ghs": extraction.ghm_ghs,
|
||||
"texte_decision": extraction.texte_decision,
|
||||
}
|
||||
|
||||
|
||||
def export_excel(extractions: list, output_path: str | Path) -> int:
|
||||
"""
|
||||
Exporte les extractions en fichier Excel formaté.
|
||||
Retourne le nombre de lignes exportées.
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Décisions UCR"
|
||||
|
||||
# En-têtes
|
||||
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
||||
cell = ws.cell(row=1, column=col_idx, value=HEADER_LABELS.get(col_name, col_name))
|
||||
cell.font = HEADER_FONT
|
||||
cell.fill = HEADER_FILL
|
||||
cell.alignment = HEADER_ALIGN
|
||||
cell.border = THIN_BORDER
|
||||
|
||||
# Données
|
||||
for row_idx, extraction in enumerate(extractions, 2):
|
||||
row_data = _extraction_to_row(extraction)
|
||||
|
||||
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
||||
value = row_data.get(col_name)
|
||||
cell = ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.alignment = CELL_ALIGN
|
||||
cell.font = CELL_FONT
|
||||
cell.border = THIN_BORDER
|
||||
|
||||
# Colorer la cellule décision
|
||||
decision_col = OUTPUT_COLUMNS.index("decision_ucr") + 1
|
||||
decision_cell = ws.cell(row=row_idx, column=decision_col)
|
||||
decision_value = row_data.get("decision_ucr", "")
|
||||
|
||||
if not extraction.extraction_success:
|
||||
decision_cell.fill = FILL_ERROR
|
||||
elif decision_value == "Favorable":
|
||||
decision_cell.fill = FILL_FAVORABLE
|
||||
elif decision_value == "Défavorable":
|
||||
decision_cell.fill = FILL_DEFAVORABLE
|
||||
else:
|
||||
decision_cell.fill = FILL_UNKNOWN
|
||||
|
||||
# Largeurs de colonnes
|
||||
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
||||
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + (col_idx - 1) // 26) + chr(65 + (col_idx - 1) % 26)
|
||||
ws.column_dimensions[col_letter].width = COLUMN_WIDTHS.get(col_name, 15)
|
||||
|
||||
# Filtres et gel
|
||||
ws.auto_filter.ref = f"A1:{chr(64 + len(OUTPUT_COLUMNS))}{len(extractions) + 1}"
|
||||
ws.freeze_panes = "A2"
|
||||
|
||||
wb.save(str(output_path))
|
||||
logger.info(f"Excel exporté : {output_path} ({len(extractions)} lignes)")
|
||||
return len(extractions)
|
||||
|
||||
|
||||
def export_csv(extractions: list, output_path: str | Path) -> int:
|
||||
"""
|
||||
Exporte les extractions en fichier CSV.
|
||||
Retourne le nombre de lignes exportées.
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, 'w', newline='', encoding='utf-8') as f:
|
||||
writer = csv.DictWriter(f, fieldnames=OUTPUT_COLUMNS, delimiter=';')
|
||||
writer.writeheader()
|
||||
|
||||
for extraction in extractions:
|
||||
row = _extraction_to_row(extraction)
|
||||
writer.writerow(row)
|
||||
|
||||
logger.info(f"CSV exporté : {output_path} ({len(extractions)} lignes)")
|
||||
return len(extractions)
|
||||
Reference in New Issue
Block a user