Files
t2a-extractor/extractor/exporter.py
dom f70d138db3 feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)
Initial commit with full extraction pipeline: PDF OCR (docTR), text
segmentation, LLM extraction (Ollama), deterministic post-processing
normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 20:44:32 +01:00

158 lines
5.3 KiB
Python

"""
Export des données extraites en Excel et CSV.
"""
import csv
import logging
from pathlib import Path
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from config import OUTPUT_COLUMNS
logger = logging.getLogger(__name__)
# Styles Excel
HEADER_FONT = Font(bold=True, color="FFFFFF", size=11, name="Arial")
HEADER_FILL = PatternFill("solid", fgColor="2F5496")
HEADER_ALIGN = Alignment(horizontal="center", vertical="center", wrap_text=True)
CELL_ALIGN = Alignment(vertical="top", wrap_text=True)
CELL_FONT = Font(name="Arial", size=10)
THIN_BORDER = Border(
left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'),
)
# Couleurs de décision
FILL_FAVORABLE = PatternFill("solid", fgColor="CCFFCC") # Vert clair
FILL_DEFAVORABLE = PatternFill("solid", fgColor="FFCCCC") # Rouge clair
FILL_UNKNOWN = PatternFill("solid", fgColor="FFFFCC") # Jaune clair
FILL_ERROR = PatternFill("solid", fgColor="FFD9CC") # Orange clair
# Largeurs de colonnes
COLUMN_WIDTHS = {
"champ": 8,
"num_ogc": 10,
"type_desaccord": 14,
"codes_etablissement": 22,
"libelle_etablissement": 40,
"codes_controleurs": 22,
"libelle_controleurs": 40,
"decision_ucr": 16,
"codes_retenus": 22,
"ghm_ghs": 22,
"texte_decision": 80,
}
# Labels d'en-tête plus lisibles
HEADER_LABELS = {
"champ": "Champ",
"num_ogc": "N° OGC",
"type_desaccord": "Type désaccord",
"codes_etablissement": "Codes Établissement",
"libelle_etablissement": "Libellé Établissement",
"codes_controleurs": "Codes Contrôleurs",
"libelle_controleurs": "Libellé Contrôleurs",
"decision_ucr": "Décision UCR",
"codes_retenus": "Codes retenus",
"ghm_ghs": "GHM / GHS",
"texte_decision": "Texte décision",
}
def _extraction_to_row(extraction) -> dict:
"""Convertit une extraction en dictionnaire pour l'export."""
return {
"champ": extraction.champ,
"num_ogc": extraction.num_ogc,
"type_desaccord": extraction.type_desaccord,
"codes_etablissement": extraction.codes_etablissement,
"libelle_etablissement": extraction.libelle_etablissement,
"codes_controleurs": extraction.codes_controleurs,
"libelle_controleurs": extraction.libelle_controleurs,
"decision_ucr": extraction.decision_ucr,
"codes_retenus": extraction.codes_retenus,
"ghm_ghs": extraction.ghm_ghs,
"texte_decision": extraction.texte_decision,
}
def export_excel(extractions: list, output_path: str | Path) -> int:
"""
Exporte les extractions en fichier Excel formaté.
Retourne le nombre de lignes exportées.
"""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
wb = Workbook()
ws = wb.active
ws.title = "Décisions UCR"
# En-têtes
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
cell = ws.cell(row=1, column=col_idx, value=HEADER_LABELS.get(col_name, col_name))
cell.font = HEADER_FONT
cell.fill = HEADER_FILL
cell.alignment = HEADER_ALIGN
cell.border = THIN_BORDER
# Données
for row_idx, extraction in enumerate(extractions, 2):
row_data = _extraction_to_row(extraction)
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
value = row_data.get(col_name)
cell = ws.cell(row=row_idx, column=col_idx, value=value)
cell.alignment = CELL_ALIGN
cell.font = CELL_FONT
cell.border = THIN_BORDER
# Colorer la cellule décision
decision_col = OUTPUT_COLUMNS.index("decision_ucr") + 1
decision_cell = ws.cell(row=row_idx, column=decision_col)
decision_value = row_data.get("decision_ucr", "")
if not extraction.extraction_success:
decision_cell.fill = FILL_ERROR
elif decision_value == "Favorable":
decision_cell.fill = FILL_FAVORABLE
elif decision_value == "Défavorable":
decision_cell.fill = FILL_DEFAVORABLE
else:
decision_cell.fill = FILL_UNKNOWN
# Largeurs de colonnes
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + (col_idx - 1) // 26) + chr(65 + (col_idx - 1) % 26)
ws.column_dimensions[col_letter].width = COLUMN_WIDTHS.get(col_name, 15)
# Filtres et gel
ws.auto_filter.ref = f"A1:{chr(64 + len(OUTPUT_COLUMNS))}{len(extractions) + 1}"
ws.freeze_panes = "A2"
wb.save(str(output_path))
logger.info(f"Excel exporté : {output_path} ({len(extractions)} lignes)")
return len(extractions)
def export_csv(extractions: list, output_path: str | Path) -> int:
"""
Exporte les extractions en fichier CSV.
Retourne le nombre de lignes exportées.
"""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=OUTPUT_COLUMNS, delimiter=';')
writer.writeheader()
for extraction in extractions:
row = _extraction_to_row(extraction)
writer.writerow(row)
logger.info(f"CSV exporté : {output_path} ({len(extractions)} lignes)")
return len(extractions)