chore: add .gitignore
This commit is contained in:
690
cpam/parse_decision_ucr.py
Normal file
690
cpam/parse_decision_ucr.py
Normal file
@@ -0,0 +1,690 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
parse_decision_ucr.py — Extraction des décisions UCR depuis un PDF scanné (contrôle T2A)
|
||||
|
||||
Entrée : PDF scanné de décision UCR (CPAM / Assurance Maladie)
|
||||
Sortie : Fichier Excel (.xlsx) avec une feuille unique
|
||||
|
||||
Colonnes extraites (enrichies pour analyse IA) :
|
||||
Champ, OGC, Type_desaccord,
|
||||
Code_etablissement, Libelle_etablissement,
|
||||
Code_controleurs, Libelle_controleurs,
|
||||
Codes_retenus_final,
|
||||
Decision, Texte_decision_complet, Resume_motif,
|
||||
Regles_citees, References_guide,
|
||||
GHM_mentionne, GHS_mentionne, GHM_final, GHS_final,
|
||||
Impact_groupage
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pymupdf
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
import unicodedata
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 0. Normalisation texte OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Single-character OCR fix-ups applied in one pass via str.translate.
_OCR_CHAR_MAP = str.maketrans({
    "\u2018": "'",   # left single quote
    "\u2019": "'",   # right single quote
    "\u201C": '"',   # left double quote
    "\u201D": '"',   # right double quote
    "\u00AB": '"',   # «
    "\u00BB": '"',   # »
    "\u00A0": " ",   # no-break space
    "\u202F": " ",   # narrow no-break space
})


def normalize_text(text: str) -> str:
    """Normalize OCR output: unify apostrophes, quotes and spaces.

    Also repairs frequent Tesseract misreads of "l'UCR".
    """
    text = text.translate(_OCR_CHAR_MAP)
    # Collapse doubled apostrophes produced by the quote mapping above.
    text = text.replace("''", "'")
    # Common OCR misreads of "l'UCR".
    text = re.sub(r"\bF'UCR\b", "l'UCR", text)
    # NOTE(review): this pattern can no longer match once "''" has been
    # collapsed above — kept for parity with the original behaviour.
    text = re.sub(r"\bl''UCR\b", "l'UCR", text)
    return text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """OCR every page of *pdf_path* with Tesseract (French) and return the text.

    Pages are rendered at *dpi* (default 300) before being fed to Tesseract;
    the per-page texts are joined with blank lines and normalized.
    """
    doc = pymupdf.open(pdf_path)
    page_count = len(doc)
    # PyMuPDF renders at 72 dpi by default; scale up to the requested dpi.
    zoom = pymupdf.Matrix(dpi / 72, dpi / 72)
    page_texts = []
    for index, page in enumerate(doc):
        print(f"  OCR page {index + 1}/{page_count}...", end="\r")
        pix = page.get_pixmap(matrix=zoom)
        image = Image.open(io.BytesIO(pix.tobytes("png")))
        page_texts.append(pytesseract.image_to_string(image, lang="fra"))
    print(f"  OCR terminé : {page_count} pages.          ")
    return normalize_text("\n\n".join(page_texts))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Parsing — Regex
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# "Champ n" section header (e.g. "Champ 1 : Séjours ...").
RE_CHAMP = re.compile(
    r"Champ\s*(?:n°\s*)?(\d+)\s*[:\-—]?\s*(?:Séjours|:)",
    re.IGNORECASE,
)

# "OGC n :" case header at the start of a line.
RE_OGC_HEADER = re.compile(
    r"(?:^|\n)\s*OGC\s+(\d+)\s*:",
    re.MULTILINE,
)

# Nature of the disagreement: DP, DAS or both.
RE_TYPE_DESACCORD = re.compile(
    r"(?:désaccord|discussion)\s+porte\s+(?:sur\s+)?(?:le\s+|les\s+)?(DP\s+et\s+(?:le\s+)?DAS|DP\s+et\s+DAS|DP|DAS)",
    re.IGNORECASE,
)

# ICD-10 / CIM-10 diagnosis code, e.g. "J44.1".
RE_CIM10 = re.compile(r"\b([A-Z]\d{2}(?:\.\d{1,2})?)\b")

# "Codage établissement :" section — captured up to the controllers section.
RE_CODAGE_ETS = re.compile(
    r"Codage\s+[ée]tablissement\s*:\s*(.*?)(?=Codage\s+contr[ôo]leurs)",
    re.IGNORECASE | re.DOTALL,
)

# "Codage contrôleurs :" section — captured up to the UCR decision heading.
# [I1]? tolerates the common OCR misread "DÉC1SION"/"DÉCSION".
RE_CODAGE_CTRL = re.compile(
    r"Codage\s+contr[ôo]leurs\s*:\s*(.*?)(?=D[EÉ]C[I1]?SION\s+UCR|PROPOSITION\s+UCR)",
    re.IGNORECASE | re.DOTALL,
)

# "DÉCISION UCR :" / "PROPOSITION UCR :" heading plus everything that follows.
RE_DECISION = re.compile(
    r"(?:D[EÉ]C[I1]?SION|PROPOSITION)\s+UCR\s*:?\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# --- Decision classification ---

# Phrasings meaning the UCR sided with the establishment.
RE_FAVORABLE = re.compile(
    r"(?:"
    r"retient\s+(?:la\s+demande|le\s+codage|l'avis)\s+(?:de\s+)?l'[ée]tablissement"
    r"|retient\s+en\s+D[PA]S\s+le\s+code"
    r"|retient\s+le\s+codage\s+du\s+DP\s+de\s+l'[ée]tablissement"
    r"|l'UCR\s+retient\s+l'avis\s+de\s+l'[ée]tablissement"
    r"|confirme\s+l'avis\s+(?:de\s+)?l'[ée]tablissement"
    r")",
    re.IGNORECASE,
)

# Phrasing meaning the UCR sided with the medical controllers.
RE_DEFAVORABLE = re.compile(
    r"confirme\s+l'avis\s+des\s+(?:m[ée]decins\s+)?contr[oô]leurs",
    re.IGNORECASE,
)

# Weaker signals combined by classify_decision as fallbacks.
RE_UCR_RETIENT = re.compile(r"l'UCR\s+retient\b", re.IGNORECASE)
RE_UCR_PROPOSE = re.compile(r"l'UCR\s+propose\b", re.IGNORECASE)
RE_NE_RETIENT_PAS = re.compile(r"ne\s+retient\s+pas", re.IGNORECASE)

# --- GHM / GHS ---

# GHM code (5-7 alphanumerics) and GHS number (3-5 digits).
RE_GHM = re.compile(r"GHM\s+([A-Z0-9]{5,7})", re.IGNORECASE)
RE_GHS = re.compile(r"GHS\s+(\d{3,5})", re.IGNORECASE)

# Impact on grouping: "better valued" vs. "unchanged".
RE_MIEUX_VALORISE = re.compile(r"mieux\s+valoris[ée]", re.IGNORECASE)
RE_PAS_MODIFIE = re.compile(
    r"(?:ne\s+modifie\s+pas|ne\s+change(?:nt)?\s+pas|pas\s+de\s+changement|reste\s+group[ée])",
    re.IGNORECASE,
)

# --- Cited rules and references ---

# Methodological-guide pages, phrased "guide p. 42" ...
RE_GUIDE_PAGE = re.compile(
    r"(?:guide\s+m[ée]thodologique|guide)\s*(?:p\.?|page)\s*(\d{1,3})",
    re.IGNORECASE,
)
# ... or "p. 42 du guide".
RE_PAGE_GUIDE = re.compile(
    r"(?:p\.?|page)\s*(\d{1,3})\s+du\s+guide",
    re.IGNORECASE,
)

# "T" coding rules (T3, T7, etc.).
RE_REGLE_T = re.compile(
    r"r[èe]gle\s+(T\d+)",
    re.IGNORECASE,
)

# ATIH fascicles: optional number (g1), subject (g2), year (g3), page (g4).
RE_FASCICULE = re.compile(
    r"fascicule\s+(?:ATIH\s+)?(?:de\s+codage\s+)?(?:PMSI\s+)?(?:n°\s*)?(\d{1,2})?\s*(?:[-–]\s*)?([A-ZÀ-Üa-zà-ü\s]+?)(?:\s+(?:de\s+)?(\d{4}))?(?:\s*(?:,\s*)?(?:p\.?\s*|page\s*)(\d+))?",
    re.IGNORECASE,
)

# Agora opinions ("avis Agora n° 12").
RE_AVIS_AGORA = re.compile(
    r"avis\s+(?:agora|AGORA)\s*(?:n°\s*)?(\d+)",
    re.IGNORECASE,
)

# Coding instructions with a page number.
RE_CONSIGNES_CODAGE = re.compile(
    r"consignes?\s+de\s+codage\s*(?:p\.?\s*|page\s*)(\d+)",
    re.IGNORECASE,
)

# Code retained by the decision ("codage retenu : X", "DP = X", "se code X"...).
RE_CODAGE_RETENU = re.compile(
    r"(?:codage\s+retenu|DP\s*(?:retenu|=)|DAS\s*(?:retenu|=)|code\s+retenu|est\s+cod[ée]\s+en|se\s+code)\s*(?:est\s+)?(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)

# Code added as DP/DAS ("est ajouté en DAS", "ajout du code X").
RE_CODE_AJOUTE = re.compile(
    r"(?:est\s+ajout[ée]\s+en\s+D[PA]S|ajout(?:er)?\s+(?:du\s+|en\s+D[PA]S\s+(?:le\s+)?)?(?:code\s+)?)\s*(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2b. Fonctions d'extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract_codes_and_label(text: str) -> tuple[str, str]:
    """Pull the CIM-10 codes and the quoted label(s) out of a coding block.

    Returns ``(codes, label)`` where *codes* joins all CIM-10 matches with
    " + " (empty string when none) and *label* joins all quoted fragments
    with " | ", falling back to the first 120 chars of the raw block.
    """
    found_codes = RE_CIM10.findall(text)
    quoted = re.findall(r'[«"](.*?)[»"]', text)
    joined_codes = " + ".join(found_codes)  # "" when nothing matched
    if quoted:
        label = " | ".join(quoted)
    else:
        label = text.strip()[:120]  # fallback: truncated raw block
    label = re.sub(r"\s+", " ", label).strip()
    return joined_codes, label
|
||||
|
||||
|
||||
def extract_codes_retenus(decision_text: str) -> str:
    """Collect the CIM-10 codes the UCR ultimately retained.

    Matches both "retained code" phrasings and "code added as DP/DAS"
    phrasings; the codes are returned sorted and joined with " + "
    (empty string when none are found).
    """
    retained = {m.group(1) for m in RE_CODAGE_RETENU.finditer(decision_text)}
    retained.update(m.group(1) for m in RE_CODE_AJOUTE.finditer(decision_text))
    return " + ".join(sorted(retained))
|
||||
|
||||
|
||||
def extract_regles(text: str) -> str:
    """Return the cited coding rules (T3, T7, ...) sorted and comma-joined.

    Empty string when no "règle Tn" mention is found.
    """
    cited = {m.group(1).upper() for m in RE_REGLE_T.finditer(text)}
    return ", ".join(sorted(cited))
|
||||
|
||||
|
||||
def extract_references(text: str) -> str:
    """Gather every documentary reference cited in *text*.

    Covers methodological-guide pages, ATIH fascicles, Agora opinions and
    coding instructions. References are de-duplicated case-insensitively
    (first occurrence wins) and joined with " ; ".
    """
    refs: list[str] = []

    # Methodological-guide pages (both "guide p.N" and "p.N du guide").
    guide_pages = {m.group(1) for m in RE_GUIDE_PAGE.finditer(text)}
    guide_pages.update(m.group(1) for m in RE_PAGE_GUIDE.finditer(text))
    if guide_pages:
        refs.append("Guide méthodologique p." + ", p.".join(sorted(guide_pages, key=int)))

    # ATIH fascicles: optional number, subject, year, page.
    for m in RE_FASCICULE.finditer(text):
        pieces = ["Fascicule"]
        if m.group(1):
            pieces.append(m.group(1))
        sujet = (m.group(2) or "").strip()
        if sujet:
            pieces.append(sujet)
        if m.group(3):
            pieces.append(f"({m.group(3)})")
        if m.group(4):
            pieces.append(f"p.{m.group(4)}")
        refs.append(" ".join(pieces))

    # Agora opinions.
    for m in RE_AVIS_AGORA.finditer(text):
        refs.append(f"Avis Agora n°{m.group(1)}")

    # Coding instructions.
    for m in RE_CONSIGNES_CODAGE.finditer(text):
        refs.append(f"Consignes de codage p.{m.group(1)}")

    # Case-insensitive de-duplication; dict preserves first-seen order.
    unique: dict[str, str] = {}
    for ref in refs:
        unique.setdefault(ref.lower(), ref)
    return " ; ".join(unique.values())
|
||||
|
||||
|
||||
def extract_ghm_ghs_all(text: str) -> tuple[list[str], list[str]]:
    """Return every distinct GHM and GHS mentioned, in first-appearance order.

    GHM codes are upper-cased; GHS numbers are kept as matched.
    """
    # dict.fromkeys keeps insertion order while dropping duplicates.
    ghms = list(dict.fromkeys(m.group(1).upper() for m in RE_GHM.finditer(text)))
    ghss = list(dict.fromkeys(m.group(1) for m in RE_GHS.finditer(text)))
    return ghms, ghss
|
||||
|
||||
|
||||
def classify_decision(decision_text: str) -> str:
    """Classify a UCR decision text.

    Returns one of "Favorable établissement", "Défavorable établissement",
    "Mixte" or "Indéterminé", based on pattern matches over the normalized
    text.
    """
    text = normalize_text(decision_text)

    fav = bool(RE_FAVORABLE.search(text))
    defav = bool(RE_DEFAVORABLE.search(text))

    ucr_retient = bool(RE_UCR_RETIENT.search(text))
    ucr_propose = bool(RE_UCR_PROPOSE.search(text))
    ne_retient_pas = bool(RE_NE_RETIENT_PAS.search(text))

    # "l'UCR retient ..." counts as favourable unless negated ("ne retient pas").
    if ucr_retient and not ne_retient_pas:
        fav = True
    # "l'UCR propose ..." counts as favourable when nothing unfavourable matched.
    if ucr_propose and not defav:
        fav = True

    # Both sides present -> mixed. (The original had a second, unreachable
    # `if fav and defav: return "Mixte"` check fully subsumed by this one;
    # it has been removed — behaviour is unchanged.)
    if defav and (fav or ucr_retient):
        return "Mixte"
    if fav:
        return "Favorable établissement"
    if defav:
        return "Défavorable établissement"
    return "Indéterminé"
|
||||
|
||||
|
||||
def clean_decision_text(text: str) -> str:
    """Strip OCR artefacts/footers from a decision block and normalize spacing.

    Removes UCR footer lines, trailing OCR noise, collapses runs of
    spaces/tabs and caps consecutive blank lines, then strips the result.
    """
    cleaned = text
    # Ordered (pattern, flags) pairs: footers first, then end-of-block noise.
    # NOTE(review): the middle pattern's alternation "(?:—|—|-)" repeats the
    # em dash — possibly an en dash was intended; kept byte-identical.
    noise_patterns = (
        (r"\n\s*(?:UCR\s+NA|CONFIDENTIEL|Page\s+\d+).*$", re.MULTILINE | re.IGNORECASE),
        (r"\n\s*[A-Z]{1,4}\s*(?:—|—|-)\s*[a-zA-Z]{0,3}\s*$", re.MULTILINE),
        (r"\n\s*(?:EE|ESS|2 ae|A D ES|EE nd)\s*$", re.MULTILINE | re.IGNORECASE),
    )
    for pattern, flags in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)    # collapse spaces and tabs
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)  # at most one blank line
    return cleaned.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2c. Parsing des blocs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_ogc_block(block_text: str, champ: int, ogc_num: int) -> dict:
    """Parse one OGC block and return an enriched, structured row.

    Args:
        block_text: OCR text covering this single OGC case.
        champ: number of the "Champ" section the block belongs to.
        ogc_num: OGC case number.

    Returns:
        A dict whose keys match the HEADERS columns; fields that could not
        be extracted are left as empty strings.
    """
    # All columns pre-initialised so the Excel writer can rely on the keys.
    result = {
        "Champ": champ,
        "OGC": ogc_num,
        "Type_desaccord": "",
        "Code_etablissement": "",
        "Libelle_etablissement": "",
        "Code_controleurs": "",
        "Libelle_controleurs": "",
        "Codes_retenus_final": "",
        "Decision": "",
        "Texte_decision_complet": "",
        "Resume_motif": "",
        "Regles_citees": "",
        "References_guide": "",
        "GHM_mentionne": "",
        "GHS_mentionne": "",
        "GHM_final": "",
        "GHS_final": "",
        "Impact_groupage": "",
    }

    # Type of disagreement (DP / DAS / both).
    m = RE_TYPE_DESACCORD.search(block_text)
    if m:
        raw = m.group(1).upper().strip()
        raw = re.sub(r"\s+", " ", raw)
        # Check the combined case first: "DP ET DAS" contains both tokens.
        if "DP" in raw and "DAS" in raw:
            result["Type_desaccord"] = "DP + DAS"
        elif "DAS" in raw:
            result["Type_desaccord"] = "DAS"
        elif "DP" in raw:
            result["Type_desaccord"] = "DP"

    # Establishment coding.
    m = RE_CODAGE_ETS.search(block_text)
    if m:
        raw_ets = m.group(1).strip()
        result["Code_etablissement"], result["Libelle_etablissement"] = extract_codes_and_label(raw_ets)

    # Controllers coding ("non repris" = the controllers dropped the code).
    m = RE_CODAGE_CTRL.search(block_text)
    if m:
        raw_ctrl = m.group(1).strip()
        if re.search(r"non\s+repris", raw_ctrl, re.IGNORECASE):
            result["Code_controleurs"] = "non repris"
            result["Libelle_controleurs"] = ""
        else:
            result["Code_controleurs"], result["Libelle_controleurs"] = extract_codes_and_label(raw_ctrl)

    # UCR decision — FULL TEXT.
    m = RE_DECISION.search(block_text)
    if m:
        decision_text = m.group(1).strip()
        decision_clean = clean_decision_text(decision_text)

        result["Decision"] = classify_decision(decision_clean)
        result["Texte_decision_complet"] = decision_clean

        # Short summary (first ~300 chars of the cleaned text).
        resume = re.sub(r"\s+", " ", decision_clean)[:300].strip()
        # Cut at the last complete sentence when one fits.
        last_dot = resume.rfind(".")
        if last_dot > 100:
            resume = resume[:last_dot + 1]
        result["Resume_motif"] = resume

        # Codes finally retained by the UCR.
        result["Codes_retenus_final"] = extract_codes_retenus(decision_clean)

    # Cited coding rules (T3, T7, etc.).
    result["Regles_citees"] = extract_regles(block_text)

    # References (guide, fascicles, Agora opinions).
    result["References_guide"] = extract_references(block_text)

    # GHM / GHS — all mentions, plus the last one (= assumed final).
    ghms, ghss = extract_ghm_ghs_all(block_text)
    if ghms:
        result["GHM_mentionne"] = " / ".join(ghms)
        result["GHM_final"] = ghms[-1]  # the last one mentioned is often the final one
    if ghss:
        result["GHS_mentionne"] = " / ".join(ghss)
        result["GHS_final"] = ghss[-1]

    # Impact on grouping.
    if RE_MIEUX_VALORISE.search(block_text):
        result["Impact_groupage"] = "Mieux valorisé"
    elif RE_PAS_MODIFIE.search(block_text):
        result["Impact_groupage"] = "Pas de changement"

    return result
|
||||
|
||||
|
||||
def parse_grouped_ogcs(text_block: str, champ: int, ogc_nums: list[int]) -> list[dict]:
    """Expand a block covering several OGC at once into one row per number.

    The block is parsed a single time; the resulting row is then cloned for
    every OGC in the group with only the "OGC" field changed.
    """
    base = parse_ogc_block(text_block, champ, ogc_nums[0])
    return [{**base, "OGC": num} for num in ogc_nums]
|
||||
|
||||
|
||||
def parse_document(full_text: str) -> list[dict]:
    """Parse the whole OCR text and return one structured row per OGC case.

    Handles both individual blocks ("OGC 12 :") and grouped decisions
    ("Concernant les OGC 14, 19, 46"), then de-duplicates, keeping for each
    OGC the row with the most filled-in fields. Rows are sorted by
    (Champ, OGC).
    """
    rows = []

    # Start offsets of every "Champ n" header and every "OGC n :" header.
    champ_positions = [(m.start(), int(m.group(1))) for m in RE_CHAMP.finditer(full_text)]
    ogc_positions = [(m.start(), int(m.group(1))) for m in RE_OGC_HEADER.finditer(full_text)]

    def get_champ_for_position(pos: int) -> int:
        # Champ in effect at `pos` = last Champ header starting before it.
        ch = 0
        for cp, cn in champ_positions:
            if cp <= pos:
                ch = cn
            else:
                break
        return ch

    # Grouped blocks ("Concernant les OGC 14, 19, 46, 50 ...").
    RE_GROUPED = re.compile(
        r"(?:Concernant|Pour)\s+les\s+OGC\s+([\d,\s]+)",
        re.IGNORECASE,
    )

    grouped_ogcs = set()
    for m in RE_GROUPED.finditer(full_text):
        nums = [int(n.strip()) for n in m.group(1).split(",") if n.strip().isdigit()]
        if len(nums) > 1:
            start = m.start()
            end = len(full_text)
            # Block ends at the next OGC header that is NOT part of the group;
            # the +50 margin skips headers overlapping the group mention itself.
            for op, on in ogc_positions:
                if op > start + 50 and on not in nums:
                    end = op
                    break
            block = full_text[start:end]
            champ = get_champ_for_position(start)
            group_rows = parse_grouped_ogcs(block, champ, nums)
            rows.extend(group_rows)
            grouped_ogcs.update(nums)

    # Individual OGC blocks.
    for idx, (pos, ogc_num) in enumerate(ogc_positions):
        champ = get_champ_for_position(pos)

        # Block ends at the next OGC header beyond a small margin ...
        end = len(full_text)
        for next_pos, _ in ogc_positions[idx + 1:]:
            if next_pos > pos + 20:
                end = next_pos
                break
        # ... or at the next Champ header, whichever comes first.
        for cp, _ in champ_positions:
            if pos < cp < end:
                end = cp
                break

        block = full_text[pos:end]
        row = parse_ogc_block(block, champ, ogc_num)

        if ogc_num in grouped_ogcs:
            # An individual block overrides its grouped row only when complete.
            if row["Code_etablissement"] and row["Decision"]:
                rows = [r for r in rows if r["OGC"] != ogc_num]
                rows.append(row)
        else:
            if row["Code_etablissement"] or row["Decision"]:
                rows.append(row)

    rows.sort(key=lambda r: (r["Champ"], r["OGC"]))

    # De-duplicate: for a repeated OGC keep the row with more non-empty fields.
    seen = {}
    deduped = []
    for r in rows:
        key = r["OGC"]
        if key in seen:
            old = seen[key]
            old_score = sum(1 for v in old.values() if v)
            new_score = sum(1 for v in r.values() if v)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)

    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Export Excel
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Internal column keys — must match the dict keys produced by parse_ogc_block.
HEADERS = [
    "Champ",
    "OGC",
    "Type_desaccord",
    "Code_etablissement",
    "Libelle_etablissement",
    "Code_controleurs",
    "Libelle_controleurs",
    "Codes_retenus_final",
    "Decision",
    "Texte_decision_complet",
    "Resume_motif",
    "Regles_citees",
    "References_guide",
    "GHM_mentionne",
    "GHS_mentionne",
    "GHM_final",
    "GHS_final",
    "Impact_groupage",
]

# Human-readable column titles for the Excel sheet.
# NOTE: must stay index-aligned with HEADERS above.
HEADER_LABELS = [
    "Champ",
    "N° OGC",
    "Type désaccord",
    "Code(s) Établissement",
    "Libellé Établissement",
    "Code(s) Contrôleurs",
    "Libellé Contrôleurs",
    "Code(s) retenus (final)",
    "Décision UCR",
    "Texte décision complet",
    "Résumé du motif",
    "Règles codage citées",
    "Références (guide, fascicules, avis)",
    "GHM mentionné(s)",
    "GHS mentionné(s)",
    "GHM final",
    "GHS final",
    "Impact groupage",
]
|
||||
|
||||
|
||||
def write_excel(rows: list[dict], output_path: str):
    """Write the parsed rows to a styled, single-sheet Excel workbook.

    Columns follow HEADERS/HEADER_LABELS. The "Décision UCR" column is
    colour-coded (green = favourable, red = unfavourable, yellow = mixed).
    An auto-filter covers the table and the header row is frozen.

    Args:
        rows: list of dicts as produced by parse_document.
        output_path: destination .xlsx path.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Header styles.
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )

    # Decision colour coding (classic Excel conditional-formatting palette).
    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row.
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows.
    # PERF: hoist loop invariants — the original recomputed
    # HEADERS.index("Decision") per row and built a new Alignment per cell.
    data_align = Alignment(vertical="top", wrap_text=True)
    dec_col = HEADERS.index("Decision") + 1
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=data.get(key, ""))
            cell.border = thin_border
            cell.alignment = data_align

        # Colour the Decision column according to the classification.
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths (characters); anything not listed defaults to 15.
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22,
        "Decision": 24, "Texte_decision_complet": 80,
        "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10,
        "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter over the whole table.
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"

    # Freeze the header row.
    ws.freeze_panes = "A2"

    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """CLI entry point: OCR the PDF, extract the decisions, export to Excel."""
    # Input PDF: first CLI argument, or a default file next to this script.
    if len(sys.argv) < 2:
        pdf_path = str(Path(__file__).parent / "SPHO-FINANC26020915121.pdf")
    else:
        pdf_path = sys.argv[1]

    output_path = str(Path(pdf_path).with_suffix(".xlsx"))

    print(f"Fichier PDF : {pdf_path}")
    print("Étape 1/3 : OCR du document...")
    full_text = ocr_pdf(pdf_path)

    # Keep the raw OCR text next to the PDF for debugging / re-runs.
    txt_path = str(Path(pdf_path).with_suffix(".txt"))
    Path(txt_path).write_text(full_text, encoding="utf-8")
    print(f" Texte brut sauvegardé : {txt_path}")

    print("Étape 2/3 : Extraction des décisions...")
    rows = parse_document(full_text)
    print(f" {len(rows)} dossiers OGC extraits.")

    # Quick summary of the classification results.
    decisions = [r.get("Decision", "") for r in rows]
    fav = sum(1 for d in decisions if "Favorable" in d and "Défavorable" not in d)
    defav = sum(1 for d in decisions if "Défavorable" in d)
    mixte = sum(1 for d in decisions if "Mixte" in d)
    indet = sum(1 for d in decisions if d in ("Indéterminé", ""))
    refs_count = sum(1 for r in rows if r.get("References_guide"))
    codes_ret = sum(1 for r in rows if r.get("Codes_retenus_final"))
    regles = sum(1 for r in rows if r.get("Regles_citees"))

    print(f" Favorable établissement : {fav}")
    print(f" Défavorable établissement : {defav}")
    print(f" Mixte : {mixte}")
    print(f" Indéterminé : {indet}")
    print(f" Avec références citées : {refs_count}")
    print(f" Avec codes retenus : {codes_ret}")
    print(f" Avec règles T : {regles}")

    print("Étape 3/3 : Génération du fichier Excel...")
    write_excel(rows, output_path)
    print("Terminé.")
|
||||
|
||||
|
||||
# Script entry point (no side effects on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user