feat: T2A-Extractor pipeline with CIM-10 normalizer (31→0 warnings)

Initial commit with full extraction pipeline: PDF OCR (docTR), text segmentation, LLM extraction (Ollama), deterministic post-processing normalizer, validation, and Excel/CSV export.

The normalizer fixes OCR/LLM errors on CIM-10 codes:
- OCR digit→letter confusion in position 1 (1→I, 0→O, 5→S, 2→Z, 8→B)
- Missing dot separator (F050→F05.0, R410→R41.0)
- '+' instead of '.' (B99+1→B99.1, J961+0→J96.10)
- Excess decimals (Z04.880→Z04.88)
- OCR letter→digit in positions 2-3 (LO2.2→L02.2)
- Literal "null" string purge
- Auto-fill codes_retenus from decision context

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 20:44:32 +01:00
commit f70d138db3
13 changed files with 1699 additions and 0 deletions
--- a/extractor/exporter.py
+++ b/extractor/exporter.py
@@ -0,0 +1,157 @@
+"""
+Export des données extraites en Excel et CSV.
+"""
+import csv
+import logging
+from pathlib import Path
+from openpyxl import Workbook
+from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
+
+from config import OUTPUT_COLUMNS
+
+logger = logging.getLogger(__name__)
+
+# Excel styling shared by the export functions.
+HEADER_FONT = Font(name="Arial", size=11, bold=True, color="FFFFFF")
+HEADER_FILL = PatternFill("solid", fgColor="2F5496")
+HEADER_ALIGN = Alignment(wrap_text=True, horizontal="center", vertical="center")
+CELL_ALIGN = Alignment(wrap_text=True, vertical="top")
+CELL_FONT = Font(size=10, name="Arial")
+# Thin line on each of the four cell edges.
+THIN_BORDER = Border(
+    **{edge: Side(style="thin") for edge in ("left", "right", "top", "bottom")}
+)
+
+# Background fills for the decision cell.
+FILL_FAVORABLE = PatternFill("solid", fgColor="CCFFCC")    # light green
+FILL_DEFAVORABLE = PatternFill("solid", fgColor="FFCCCC")  # light red
+FILL_UNKNOWN = PatternFill("solid", fgColor="FFFFCC")      # light yellow
+FILL_ERROR = PatternFill("solid", fgColor="FFD9CC")        # light orange
+
+# Column widths, keyed by output column name.
+COLUMN_WIDTHS = dict(
+    champ=8,
+    num_ogc=10,
+    type_desaccord=14,
+    codes_etablissement=22,
+    libelle_etablissement=40,
+    codes_controleurs=22,
+    libelle_controleurs=40,
+    decision_ucr=16,
+    codes_retenus=22,
+    ghm_ghs=22,
+    texte_decision=80,
+)
+
+# Human-readable header labels, keyed by output column name.
+HEADER_LABELS = dict(
+    champ="Champ",
+    num_ogc="N° OGC",
+    type_desaccord="Type désaccord",
+    codes_etablissement="Codes Établissement",
+    libelle_etablissement="Libellé Établissement",
+    codes_controleurs="Codes Contrôleurs",
+    libelle_controleurs="Libellé Contrôleurs",
+    decision_ucr="Décision UCR",
+    codes_retenus="Codes retenus",
+    ghm_ghs="GHM / GHS",
+    texte_decision="Texte décision",
+)
+
+
+def _extraction_to_row(extraction) -> dict:
+    """Build the export row for one extraction.
+
+    Reads the eleven exported attributes from ``extraction`` and returns them
+    in a dict keyed by attribute name, in the order listed below.
+    """
+    exported_fields = (
+        "champ",
+        "num_ogc",
+        "type_desaccord",
+        "codes_etablissement",
+        "libelle_etablissement",
+        "codes_controleurs",
+        "libelle_controleurs",
+        "decision_ucr",
+        "codes_retenus",
+        "ghm_ghs",
+        "texte_decision",
+    )
+    return {field: getattr(extraction, field) for field in exported_fields}
+
+
+def export_excel(extractions: list, output_path: str | Path) -> int:
+    """Export the extractions to a formatted Excel workbook.
+
+    Writes one styled header row followed by one row per extraction, colours
+    the decision cell of each row, sets the column widths, enables an
+    auto-filter over the written range and freezes the header row.
+
+    Args:
+        extractions: Objects accepted by ``_extraction_to_row`` that also
+            expose an ``extraction_success`` attribute.
+        output_path: Destination file; missing parent directories are created.
+
+    Returns:
+        The number of data rows written (``len(extractions)``).
+    """
+    # Function-scope import so the block stays self-contained; openpyxl is
+    # already a dependency of this module.
+    from openpyxl.utils import get_column_letter
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "Décisions UCR"
+
+    # Header row
+    for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
+        cell = ws.cell(row=1, column=col_idx, value=HEADER_LABELS.get(col_name, col_name))
+        cell.font = HEADER_FONT
+        cell.fill = HEADER_FILL
+        cell.alignment = HEADER_ALIGN
+        cell.border = THIN_BORDER
+
+    # Resolve the decision column once instead of on every row. It is None
+    # when "decision_ucr" is not configured, in which case no cell is coloured
+    # (the previous per-row .index() call raised ValueError instead).
+    if "decision_ucr" in OUTPUT_COLUMNS:
+        decision_col = OUTPUT_COLUMNS.index("decision_ucr") + 1
+    else:
+        decision_col = None
+
+    # Data rows start at row 2, right below the header.
+    for row_idx, extraction in enumerate(extractions, 2):
+        row_data = _extraction_to_row(extraction)
+
+        for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
+            cell = ws.cell(row=row_idx, column=col_idx, value=row_data.get(col_name))
+            cell.alignment = CELL_ALIGN
+            cell.font = CELL_FONT
+            cell.border = THIN_BORDER
+
+        if decision_col is None:
+            continue
+
+        # Colour the decision cell; a failed extraction takes precedence over
+        # the decision text.
+        decision_cell = ws.cell(row=row_idx, column=decision_col)
+        decision_value = row_data.get("decision_ucr", "")
+
+        if not extraction.extraction_success:
+            decision_cell.fill = FILL_ERROR
+        elif decision_value == "Favorable":
+            decision_cell.fill = FILL_FAVORABLE
+        elif decision_value == "Défavorable":
+            decision_cell.fill = FILL_DEFAVORABLE
+        else:
+            decision_cell.fill = FILL_UNKNOWN
+
+    # Column widths (15 for columns without a configured width)
+    for col_idx, col_name in enumerate(OUTPUT_COLUMNS, 1):
+        col_letter = get_column_letter(col_idx)
+        ws.column_dimensions[col_letter].width = COLUMN_WIDTHS.get(col_name, 15)
+
+    # Auto-filter over header + data. get_column_letter keeps the range valid
+    # past column Z, where chr(64 + n) would produce a non-letter character.
+    if OUTPUT_COLUMNS:
+        last_col = get_column_letter(len(OUTPUT_COLUMNS))
+        ws.auto_filter.ref = f"A1:{last_col}{len(extractions) + 1}"
+    # Keep the header row visible while scrolling.
+    ws.freeze_panes = "A2"
+
+    wb.save(str(output_path))
+    logger.info(f"Excel exporté : {output_path} ({len(extractions)} lignes)")
+    return len(extractions)
+
+
+def export_csv(extractions: list, output_path: str | Path) -> int:
+    """Export the extractions to a semicolon-delimited, UTF-8 encoded CSV file.
+
+    Writes a header line made of the ``OUTPUT_COLUMNS`` names, then one line
+    per extraction.
+
+    Args:
+        extractions: Objects accepted by ``_extraction_to_row``.
+        output_path: Destination file; missing parent directories are created.
+
+    Returns:
+        The number of data rows written (``len(extractions)``).
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, 'w', newline='', encoding='utf-8') as f:
+        # extrasaction='ignore': row keys that are not listed in OUTPUT_COLUMNS
+        # are dropped instead of raising ValueError, matching export_excel,
+        # which only reads the configured columns from each row.
+        writer = csv.DictWriter(
+            f,
+            fieldnames=OUTPUT_COLUMNS,
+            delimiter=';',
+            extrasaction='ignore',
+        )
+        writer.writeheader()
+        writer.writerows(_extraction_to_row(extraction) for extraction in extractions)
+
+    logger.info(f"CSV exporté : {output_path} ({len(extractions)} lignes)")
+    return len(extractions)