Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export. The normalizer fixes OCR/LLM errors on CIM-10 codes: - OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B) - Missing dot separator (F050→F05.0, R410→R41.0) - '+' instead of '.' (B99+1→B99.1, J961+0→J96.10) - Excess decimals (Z04.880→Z04.88) - OCR letter→digit in positions 2-3 (LO2.2→L02.2) - Literal "null" string purge - Auto-fill codes_retenus from decision context Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
158 lines
5.3 KiB
Python
158 lines
5.3 KiB
Python
"""
|
|
Export des données extraites en Excel et CSV.
|
|
"""
|
|
import csv
|
|
import logging
|
|
from pathlib import Path
|
|
from openpyxl import Workbook
|
|
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
|
|
|
from config import OUTPUT_COLUMNS
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Styles Excel
|
|
HEADER_FONT = Font(bold=True, color="FFFFFF", size=11, name="Arial")
|
|
HEADER_FILL = PatternFill("solid", fgColor="2F5496")
|
|
HEADER_ALIGN = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
|
CELL_ALIGN = Alignment(vertical="top", wrap_text=True)
|
|
CELL_FONT = Font(name="Arial", size=10)
|
|
THIN_BORDER = Border(
|
|
left=Side(style='thin'),
|
|
right=Side(style='thin'),
|
|
top=Side(style='thin'),
|
|
bottom=Side(style='thin'),
|
|
)
|
|
|
|
# Couleurs de décision
|
|
FILL_FAVORABLE = PatternFill("solid", fgColor="CCFFCC") # Vert clair
|
|
FILL_DEFAVORABLE = PatternFill("solid", fgColor="FFCCCC") # Rouge clair
|
|
FILL_UNKNOWN = PatternFill("solid", fgColor="FFFFCC") # Jaune clair
|
|
FILL_ERROR = PatternFill("solid", fgColor="FFD9CC") # Orange clair
|
|
|
|
# Largeurs de colonnes
|
|
COLUMN_WIDTHS = {
|
|
"champ": 8,
|
|
"num_ogc": 10,
|
|
"type_desaccord": 14,
|
|
"codes_etablissement": 22,
|
|
"libelle_etablissement": 40,
|
|
"codes_controleurs": 22,
|
|
"libelle_controleurs": 40,
|
|
"decision_ucr": 16,
|
|
"codes_retenus": 22,
|
|
"ghm_ghs": 22,
|
|
"texte_decision": 80,
|
|
}
|
|
|
|
# Labels d'en-tête plus lisibles
|
|
HEADER_LABELS = {
|
|
"champ": "Champ",
|
|
"num_ogc": "N° OGC",
|
|
"type_desaccord": "Type désaccord",
|
|
"codes_etablissement": "Codes Établissement",
|
|
"libelle_etablissement": "Libellé Établissement",
|
|
"codes_controleurs": "Codes Contrôleurs",
|
|
"libelle_controleurs": "Libellé Contrôleurs",
|
|
"decision_ucr": "Décision UCR",
|
|
"codes_retenus": "Codes retenus",
|
|
"ghm_ghs": "GHM / GHS",
|
|
"texte_decision": "Texte décision",
|
|
}
|
|
|
|
|
|
def _extraction_to_row(extraction) -> dict:
|
|
"""Convertit une extraction en dictionnaire pour l'export."""
|
|
return {
|
|
"champ": extraction.champ,
|
|
"num_ogc": extraction.num_ogc,
|
|
"type_desaccord": extraction.type_desaccord,
|
|
"codes_etablissement": extraction.codes_etablissement,
|
|
"libelle_etablissement": extraction.libelle_etablissement,
|
|
"codes_controleurs": extraction.codes_controleurs,
|
|
"libelle_controleurs": extraction.libelle_controleurs,
|
|
"decision_ucr": extraction.decision_ucr,
|
|
"codes_retenus": extraction.codes_retenus,
|
|
"ghm_ghs": extraction.ghm_ghs,
|
|
"texte_decision": extraction.texte_decision,
|
|
}
|
|
|
|
|
|
def export_excel(extractions: list, output_path: str | Path) -> int:
|
|
"""
|
|
Exporte les extractions en fichier Excel formaté.
|
|
Retourne le nombre de lignes exportées.
|
|
"""
|
|
output_path = Path(output_path)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
wb = Workbook()
|
|
ws = wb.active
|
|
ws.title = "Décisions UCR"
|
|
|
|
# En-têtes
|
|
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
|
cell = ws.cell(row=1, column=col_idx, value=HEADER_LABELS.get(col_name, col_name))
|
|
cell.font = HEADER_FONT
|
|
cell.fill = HEADER_FILL
|
|
cell.alignment = HEADER_ALIGN
|
|
cell.border = THIN_BORDER
|
|
|
|
# Données
|
|
for row_idx, extraction in enumerate(extractions, 2):
|
|
row_data = _extraction_to_row(extraction)
|
|
|
|
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
|
value = row_data.get(col_name)
|
|
cell = ws.cell(row=row_idx, column=col_idx, value=value)
|
|
cell.alignment = CELL_ALIGN
|
|
cell.font = CELL_FONT
|
|
cell.border = THIN_BORDER
|
|
|
|
# Colorer la cellule décision
|
|
decision_col = OUTPUT_COLUMNS.index("decision_ucr") + 1
|
|
decision_cell = ws.cell(row=row_idx, column=decision_col)
|
|
decision_value = row_data.get("decision_ucr", "")
|
|
|
|
if not extraction.extraction_success:
|
|
decision_cell.fill = FILL_ERROR
|
|
elif decision_value == "Favorable":
|
|
decision_cell.fill = FILL_FAVORABLE
|
|
elif decision_value == "Défavorable":
|
|
decision_cell.fill = FILL_DEFAVORABLE
|
|
else:
|
|
decision_cell.fill = FILL_UNKNOWN
|
|
|
|
# Largeurs de colonnes
|
|
for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
|
|
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + (col_idx - 1) // 26) + chr(65 + (col_idx - 1) % 26)
|
|
ws.column_dimensions[col_letter].width = COLUMN_WIDTHS.get(col_name, 15)
|
|
|
|
# Filtres et gel
|
|
ws.auto_filter.ref = f"A1:{chr(64 + len(OUTPUT_COLUMNS))}{len(extractions) + 1}"
|
|
ws.freeze_panes = "A2"
|
|
|
|
wb.save(str(output_path))
|
|
logger.info(f"Excel exporté : {output_path} ({len(extractions)} lignes)")
|
|
return len(extractions)
|
|
|
|
|
|
def export_csv(extractions: list, output_path: str | Path) -> int:
|
|
"""
|
|
Exporte les extractions en fichier CSV.
|
|
Retourne le nombre de lignes exportées.
|
|
"""
|
|
output_path = Path(output_path)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
with open(output_path, 'w', newline='', encoding='utf-8') as f:
|
|
writer = csv.DictWriter(f, fieldnames=OUTPUT_COLUMNS, delimiter=';')
|
|
writer.writeheader()
|
|
|
|
for extraction in extractions:
|
|
row = _extraction_to_row(extraction)
|
|
writer.writerow(row)
|
|
|
|
logger.info(f"CSV exporté : {output_path} ({len(extractions)} lignes)")
|
|
return len(extractions)
|