#!/usr/bin/env python3 """ parse_decision_ucr.py — Extraction des décisions UCR depuis un PDF scanné (contrôle T2A) Entrée : PDF scanné de décision UCR (CPAM / Assurance Maladie) Sortie : Fichier Excel (.xlsx) avec une feuille unique Colonnes extraites (enrichies pour analyse IA) : Champ, OGC, Type_desaccord, Code_etablissement, Libelle_etablissement, Code_controleurs, Libelle_controleurs, Codes_retenus_final, Decision, Texte_decision_complet, Resume_motif, Regles_citees, References_guide, GHM_mentionne, GHS_mentionne, GHM_final, GHS_final, Impact_groupage """ from __future__ import annotations import re import sys from pathlib import Path import pymupdf import pytesseract from PIL import Image import io from openpyxl import Workbook from openpyxl.styles import Font, PatternFill, Alignment, Border, Side import unicodedata # --------------------------------------------------------------------------- # 0. Normalisation texte OCR # --------------------------------------------------------------------------- def normalize_text(text: str) -> str: """Normalise les apostrophes, guillemets et espaces issus de l'OCR.""" text = text.replace("\u2018", "'").replace("\u2019", "'") text = text.replace("\u201C", '"').replace("\u201D", '"') text = text.replace("\u00AB", '"').replace("\u00BB", '"') text = text.replace("''", "'") text = text.replace("\u00A0", " ").replace("\u202F", " ") # Erreurs OCR courantes text = re.sub(r"\bF'UCR\b", "l'UCR", text) text = re.sub(r"\bl''UCR\b", "l'UCR", text) return text # --------------------------------------------------------------------------- # 1. 
# ---------------------------------------------------------------------------

def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """Extract the text of every page of the PDF through Tesseract OCR.

    Each page is rasterised at *dpi* (default 300 — PDF native resolution
    is 72 dpi) and fed to Tesseract with the French language model.  Page
    texts are joined with blank lines and passed through normalize_text().
    """
    doc = pymupdf.open(pdf_path)
    page_texts = []
    n_pages = len(doc)
    for page_no, page in enumerate(doc):
        print(f" OCR page {page_no+1}/{n_pages}...", end="\r")
        # Scale factor dpi/72 turns PDF points into pixels at *dpi*.
        mat = pymupdf.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        page_texts.append(pytesseract.image_to_string(img, lang="fra"))
    print(f" OCR terminé : {n_pages} pages. ")
    return normalize_text("\n\n".join(page_texts))


# ---------------------------------------------------------------------------
# 2. Parsing — regular expressions
# ---------------------------------------------------------------------------

# "Champ n° 3 : Séjours ..." section headers.
RE_CHAMP = re.compile(
    r"Champ\s*(?:n°\s*)?(\d+)\s*[:\-—]?\s*(?:Séjours|:)",
    re.IGNORECASE,
)
# "OGC 12 :" case headers, anchored at line start.
RE_OGC_HEADER = re.compile(
    r"(?:^|\n)\s*OGC\s+(\d+)\s*:",
    re.MULTILINE,
)
# Nature of the disagreement: DP, DAS, or both.
RE_TYPE_DESACCORD = re.compile(
    r"(?:désaccord|discussion)\s+porte\s+(?:sur\s+)?(?:le\s+|les\s+)?(DP\s+et\s+(?:le\s+)?DAS|DP\s+et\s+DAS|DP|DAS)",
    re.IGNORECASE,
)
# ICD-10 code: letter + two digits + optional 1-2 decimal digits.
RE_CIM10 = re.compile(r"\b([A-Z]\d{2}(?:\.\d{1,2})?)\b")
# Hospital coding block, up to the controllers' coding block.
RE_CODAGE_ETS = re.compile(
    r"Codage\s+[ée]tablissement\s*:\s*(.*?)(?=Codage\s+contr[ôo]leurs)",
    re.IGNORECASE | re.DOTALL,
)
# Controllers' coding block, up to the UCR decision/proposal.
RE_CODAGE_CTRL = re.compile(
    r"Codage\s+contr[ôo]leurs\s*:\s*(.*?)(?=D[EÉ]C[I1]?SION\s+UCR|PROPOSITION\s+UCR)",
    re.IGNORECASE | re.DOTALL,
)
# UCR decision text ("I" tolerated as OCR "1"), to end of block.
RE_DECISION = re.compile(
    r"(?:D[EÉ]C[I1]?SION|PROPOSITION)\s+UCR\s*:?\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# --- Decision classification -----------------------------------------------

# Phrasings meaning the UCR sided with the hospital.
RE_FAVORABLE = re.compile(
    r"(?:"
    r"retient\s+(?:la\s+demande|le\s+codage|l'avis)\s+(?:de\s+)?l'[ée]tablissement"
    r"|retient\s+en\s+D[PA]S\s+le\s+code"
    r"|retient\s+le\s+codage\s+du\s+DP\s+de\s+l'[ée]tablissement"
    r"|l'UCR\s+retient\s+l'avis\s+de\s+l'[ée]tablissement"
    r"|confirme\s+l'avis\s+(?:de\s+)?l'[ée]tablissement"
    r")",
    re.IGNORECASE,
)
# The UCR confirmed the (medical) controllers' position.
RE_DEFAVORABLE = re.compile(
    r"confirme\s+l'avis\s+des\s+(?:m[ée]decins\s+)?contr[oô]leurs",
    re.IGNORECASE,
)
RE_UCR_RETIENT = re.compile(r"l'UCR\s+retient\b", re.IGNORECASE)
RE_UCR_PROPOSE = re.compile(r"l'UCR\s+propose\b", re.IGNORECASE)
RE_NE_RETIENT_PAS = re.compile(r"ne\s+retient\s+pas", re.IGNORECASE)

# --- GHM / GHS --------------------------------------------------------------

RE_GHM = re.compile(r"GHM\s+([A-Z0-9]{5,7})", re.IGNORECASE)
RE_GHS = re.compile(r"GHS\s+(\d{3,5})", re.IGNORECASE)
RE_MIEUX_VALORISE = re.compile(r"mieux\s+valoris[ée]", re.IGNORECASE)
RE_PAS_MODIFIE = re.compile(
    r"(?:ne\s+modifie\s+pas|ne\s+change(?:nt)?\s+pas|pas\s+de\s+changement|reste\s+group[ée])",
    re.IGNORECASE,
)

# --- Cited rules and references --------------------------------------------

# Pages of the methodological guide ("guide p. 42" phrasing).
RE_GUIDE_PAGE = re.compile(
    r"(?:guide\s+m[ée]thodologique|guide)\s*(?:p\.?|page)\s*(\d{1,3})",
    re.IGNORECASE,
)
# Same information, "page 42 du guide" phrasing.
RE_PAGE_GUIDE = re.compile(
    r"(?:p\.?|page)\s*(\d{1,3})\s+du\s+guide",
    re.IGNORECASE,
)
# Coding rules T3, T7, etc.
RE_REGLE_T = re.compile(
    r"r[èe]gle\s+(T\d+)",
    re.IGNORECASE,
)
# ATIH fascicules: number, subject, year, page — all optional groups.
RE_FASCICULE = re.compile(
    r"fascicule\s+(?:ATIH\s+)?(?:de\s+codage\s+)?(?:PMSI\s+)?(?:n°\s*)?(\d{1,2})?\s*(?:[-–]\s*)?([A-ZÀ-Üa-zà-ü\s]+?)(?:\s+(?:de\s+)?(\d{4}))?(?:\s*(?:,\s*)?(?:p\.?\s*|page\s*)(\d+))?",
    re.IGNORECASE,
)
# Agora opinions ("avis Agora n° 12").
RE_AVIS_AGORA = re.compile(
    r"avis\s+(?:agora|AGORA)\s*(?:n°\s*)?(\d+)",
    re.IGNORECASE,
)
# Coding instructions with a page number.
RE_CONSIGNES_CODAGE = re.compile(
    r"consignes?\s+de\s+codage\s*(?:p\.?\s*|page\s*)(\d+)",
    re.IGNORECASE,
)
# Final code retained by the UCR ("codage retenu", "DP =", "se code", ...).
RE_CODAGE_RETENU = re.compile(
    r"(?:codage\s+retenu|DP\s*(?:retenu|=)|DAS\s*(?:retenu|=)|code\s+retenu|est\s+cod[ée]\s+en|se\s+code)\s*(?:est\s+)?(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
# Code added as DP/DAS ("est ajouté en DAS", "ajout du code X").
RE_CODE_AJOUTE = re.compile(
    r"(?:est\s+ajout[ée]\s+en\s+D[PA]S|ajout(?:er)?\s+(?:du\s+|en\s+D[PA]S\s+(?:le\s+)?)?(?:code\s+)?)\s*(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# 2b. Extraction helpers
# ---------------------------------------------------------------------------

def extract_codes_and_label(text: str) -> tuple[str, str]:
    """Pull the CIM-10 codes and the quoted label out of a coding block.

    Returns ("code1 + code2", "label1 | label2"); when no quoted label is
    found, the first 120 characters of the raw text serve as the label.
    """
    codes = RE_CIM10.findall(text)
    labels = re.findall(r'[«"](.*?)[»"]', text)
    code_str = " + ".join(codes) if codes else ""
    label_str = " | ".join(labels) if labels else text.strip()[:120]
    return code_str, re.sub(r"\s+", " ", label_str).strip()


def extract_codes_retenus(decision_text: str) -> str:
    """Return the codes finally retained by the UCR, sorted and ' + '-joined."""
    codes = {m.group(1) for m in RE_CODAGE_RETENU.finditer(decision_text)}
    codes |= {m.group(1) for m in RE_CODE_AJOUTE.finditer(decision_text)}
    return " + ".join(sorted(codes)) if codes else ""


def extract_regles(text: str) -> str:
    """Return the cited coding rules (T3, T7, ...) as a sorted, comma-joined string."""
    regles = {m.group(1).upper() for m in RE_REGLE_T.finditer(text)}
    return ", ".join(sorted(regles)) if regles else ""


def extract_references(text: str) -> str:
    """Collect every cited reference: guide pages, ATIH fascicules, Agora
    opinions and coding instructions — deduplicated case-insensitively,
    first occurrence wins, joined with ' ; '."""
    refs: list[str] = []

    # Methodological guide pages (both phrasings feed the same set).
    pages_guide = {m.group(1) for m in RE_GUIDE_PAGE.finditer(text)}
    pages_guide |= {m.group(1) for m in RE_PAGE_GUIDE.finditer(text)}
    if pages_guide:
        refs.append("Guide méthodologique p." + ", p.".join(sorted(pages_guide, key=int)))

    # ATIH fascicules — every captured part is optional.
    for m in RE_FASCICULE.finditer(text):
        num = m.group(1) or ""
        sujet = (m.group(2) or "").strip()
        annee = m.group(3) or ""
        page = m.group(4) or ""
        ref = "Fascicule"
        if num:
            ref += f" {num}"
        if sujet:
            ref += f" {sujet}"
        if annee:
            ref += f" ({annee})"
        if page:
            ref += f" p.{page}"
        refs.append(ref.strip())

    # Agora opinions.
    refs.extend(f"Avis Agora n°{m.group(1)}" for m in RE_AVIS_AGORA.finditer(text))
    # Coding instructions with a page number.
    refs.extend(f"Consignes de codage p.{m.group(1)}" for m in RE_CONSIGNES_CODAGE.finditer(text))

    # Case-insensitive dedup preserving first-seen order.
    seen: set[str] = set()
    unique: list[str] = []
    for ref in refs:
        key = ref.lower()
        if key not in seen:
            seen.add(key)
            unique.append(ref)
    return " ; ".join(unique) if unique else ""


def extract_ghm_ghs_all(text: str) -> tuple[list[str], list[str]]:
    """Return every GHM and every GHS mentioned, in order, without duplicates."""
    ghms: list[str] = []
    for m in RE_GHM.finditer(text):
        code = m.group(1).upper()
        if code not in ghms:
            ghms.append(code)
    ghss: list[str] = []
    for m in RE_GHS.finditer(text):
        code = m.group(1)
        if code not in ghss:
            ghss.append(code)
    return ghms, ghss


def classify_decision(decision_text: str) -> str:
    """Classify the decision: Favorable / Défavorable / Mixte / Indéterminé."""
    text = normalize_text(decision_text)
    fav = bool(RE_FAVORABLE.search(text))
    defav = bool(RE_DEFAVORABLE.search(text))
    ucr_retient = bool(RE_UCR_RETIENT.search(text))
    ucr_propose = bool(RE_UCR_PROPOSE.search(text))
    ne_retient_pas = bool(RE_NE_RETIENT_PAS.search(text))

    # "l'UCR retient ..." counts as favourable unless explicitly negated.
    if ucr_retient and not ne_retient_pas:
        fav = True
    # "l'UCR propose ..." leans favourable when the controllers' position
    # is not explicitly confirmed.
    if ucr_propose and not defav:
        fav = True

    # Both signals present -> mixed outcome.
    if defav and (fav or ucr_retient):
        return "Mixte"
    if fav:
        return "Favorable établissement"
    if defav:
        return "Défavorable établissement"
    return "Indéterminé"


def clean_decision_text(text: str) -> str:
    """Clean the decision text: strip footer lines and trailing OCR noise,
    then normalise whitespace."""
    # UCR footer lines (confidentiality notices, page numbers).
    text = re.sub(r"\n\s*(?:UCR\s+NA|CONFIDENTIEL|Page\s+\d+).*$", "", text,
                  flags=re.MULTILINE | re.IGNORECASE)
    # Trailing OCR artefacts: short isolated capital runs and junk tokens.
    text = re.sub(r"\n\s*[A-Z]{1,4}\s*(?:—|—|-)\s*[a-zA-Z]{0,3}\s*$", "", text,
                  flags=re.MULTILINE)
    text = re.sub(r"\n\s*(?:EE|ESS|2 ae|A D ES|EE nd)\s*$", "", text,
                  flags=re.MULTILINE | re.IGNORECASE)
    # Collapse runs of spaces/tabs and excessive blank lines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ---------------------------------------------------------------------------
# 2c. Block parsing
# ---------------------------------------------------------------------------

def parse_ogc_block(block_text: str, champ: int, ogc_num: int) -> dict:
    """Parse one OGC block into the enriched flat result dictionary."""
    result = {
        "Champ": champ,
        "OGC": ogc_num,
        "Type_desaccord": "",
        "Code_etablissement": "",
        "Libelle_etablissement": "",
        "Code_controleurs": "",
        "Libelle_controleurs": "",
        "Codes_retenus_final": "",
        "Decision": "",
        "Texte_decision_complet": "",
        "Resume_motif": "",
        "Regles_citees": "",
        "References_guide": "",
        "GHM_mentionne": "",
        "GHS_mentionne": "",
        "GHM_final": "",
        "GHS_final": "",
        "Impact_groupage": "",
    }

    # Nature of the disagreement (DP, DAS, or both).
    m = RE_TYPE_DESACCORD.search(block_text)
    if m:
        raw = re.sub(r"\s+", " ", m.group(1).upper().strip())
        if "DP" in raw and "DAS" in raw:
            result["Type_desaccord"] = "DP + DAS"
        elif "DAS" in raw:
            result["Type_desaccord"] = "DAS"
        elif "DP" in raw:
            result["Type_desaccord"] = "DP"

    # Hospital coding.
    m = RE_CODAGE_ETS.search(block_text)
    if m:
        codes, label = extract_codes_and_label(m.group(1).strip())
        result["Code_etablissement"] = codes
        result["Libelle_etablissement"] = label

    # Controllers' coding.
    m = RE_CODAGE_CTRL.search(block_text)
    if m:
        raw_ctrl = m.group(1).strip()
        if re.search(r"non\s+repris", raw_ctrl, re.IGNORECASE):
            # The controllers did not recode anything.
            result["Code_controleurs"] = "non repris"
            result["Libelle_controleurs"] = ""
        else:
            codes, label = extract_codes_and_label(raw_ctrl)
            result["Code_controleurs"] = codes
            result["Libelle_controleurs"] = label

    # UCR decision — full text kept for downstream analysis.
    m = RE_DECISION.search(block_text)
    if m:
        decision_clean = clean_decision_text(m.group(1).strip())
        result["Decision"] = classify_decision(decision_clean)
        result["Texte_decision_complet"] = decision_clean
        # Short summary: first ~300 chars, cut at the last full sentence
        # when long enough.
        resume = re.sub(r"\s+", " ", decision_clean)[:300].strip()
        last_dot = resume.rfind(".")
        if last_dot > 100:
            resume = resume[:last_dot + 1]
        result["Resume_motif"] = resume
        # Codes finally retained by the UCR.
        result["Codes_retenus_final"] = extract_codes_retenus(decision_clean)

    # Cited rules (T3, T7, ...) and references (guide, fascicules, Agora).
    result["Regles_citees"] = extract_regles(block_text)
    result["References_guide"] = extract_references(block_text)

    # GHM / GHS — all mentions; the last one is usually the final grouping.
    ghms, ghss = extract_ghm_ghs_all(block_text)
    if ghms:
        result["GHM_mentionne"] = " / ".join(ghms)
        result["GHM_final"] = ghms[-1]
    if ghss:
        result["GHS_mentionne"] = " / ".join(ghss)
        result["GHS_final"] = ghss[-1]

    # Impact on the grouping / valuation.
    if RE_MIEUX_VALORISE.search(block_text):
        result["Impact_groupage"] = "Mieux valorisé"
    elif RE_PAS_MODIFIE.search(block_text):
        result["Impact_groupage"] = "Pas de changement"
    return result


def parse_grouped_ogcs(text_block: str, champ: int, ogc_nums: list[int]) -> list[dict]:
    """Parse a grouped block (e.g. OGC 14, 19, 46, 50 handled together):
    the shared text is parsed once and the row duplicated per OGC number."""
    template = parse_ogc_block(text_block, champ, ogc_nums[0])
    return [{**template, "OGC": num} for num in ogc_nums]


def parse_document(full_text: str) -> list[dict]:
    """Parse the full OCR text and return one row per OGC case, sorted and
    deduplicated (keeping the most complete row per OGC)."""
    rows: list[dict] = []
    champ_positions = [(m.start(), int(m.group(1))) for m in RE_CHAMP.finditer(full_text)]
    ogc_positions = [(m.start(), int(m.group(1))) for m in RE_OGC_HEADER.finditer(full_text)]

    def get_champ_for_position(pos: int) -> int:
        # Number of the last "Champ" header before *pos* (0 if none).
        current = 0
        for start, num in champ_positions:
            if start <= pos:
                current = num
            else:
                break
        return current

    # Grouped blocks: "Concernant les OGC 14, 19, ..." — one shared decision
    # applying to several OGC numbers at once.
    RE_GROUPED = re.compile(
        r"(?:Concernant|Pour)\s+les\s+OGC\s+([\d,\s]+)",
        re.IGNORECASE,
    )
    grouped_ogcs: set[int] = set()
    for m in RE_GROUPED.finditer(full_text):
        nums = [int(n.strip()) for n in m.group(1).split(",") if n.strip().isdigit()]
        if len(nums) <= 1:
            continue
        start = m.start()
        end = len(full_text)
        # Stop at the first OGC header that is not part of the group.
        for op, on in ogc_positions:
            if op > start + 50 and on not in nums:
                end = op
                break
        block = full_text[start:end]
        rows.extend(parse_grouped_ogcs(block, get_champ_for_position(start), nums))
        grouped_ogcs.update(nums)

    # Individual OGC blocks.
    for idx, (pos, ogc_num) in enumerate(ogc_positions):
        champ = get_champ_for_position(pos)
        end = len(full_text)
        for next_pos, _ in ogc_positions[idx + 1:]:
            if next_pos > pos + 20:
                end = next_pos
                break
        # Never run past the next "Champ" section header.
        for cp, _ in champ_positions:
            if pos < cp < end:
                end = cp
                break
        row = parse_ogc_block(full_text[pos:end], champ, ogc_num)
        if ogc_num in grouped_ogcs:
            # A dedicated block overrides the grouped template only when
            # it is clearly complete.
            if row["Code_etablissement"] and row["Decision"]:
                rows = [r for r in rows if r["OGC"] != ogc_num]
                rows.append(row)
        elif row["Code_etablissement"] or row["Decision"]:
            rows.append(row)

    rows.sort(key=lambda r: (r["Champ"], r["OGC"]))

    # Deduplicate by OGC number, keeping the row with the most filled fields.
    seen: dict = {}
    deduped: list[dict] = []
    for r in rows:
        key = r["OGC"]
        if key in seen:
            old_score = sum(1 for v in seen[key].values() if v)
            new_score = sum(1 for v in r.values() if v)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)
    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped


# ---------------------------------------------------------------------------
# 3. Excel export
# ---------------------------------------------------------------------------

# Internal row-dict keys; their order defines the sheet's column order.
HEADERS = [
    "Champ", "OGC", "Type_desaccord",
    "Code_etablissement", "Libelle_etablissement",
    "Code_controleurs", "Libelle_controleurs",
    "Codes_retenus_final", "Decision", "Texte_decision_complet",
    "Resume_motif", "Regles_citees", "References_guide",
    "GHM_mentionne", "GHS_mentionne", "GHM_final", "GHS_final",
    "Impact_groupage",
]

# Human-readable column titles, strictly parallel to HEADERS.
HEADER_LABELS = [
    "Champ", "N° OGC", "Type désaccord",
    "Code(s) Établissement", "Libellé Établissement",
    "Code(s) Contrôleurs", "Libellé Contrôleurs",
    "Code(s) retenus (final)", "Décision UCR", "Texte décision complet",
    "Résumé du motif", "Règles codage citées",
    "Références (guide, fascicules, avis)",
    "GHM mentionné(s)", "GHS mentionné(s)", "GHM final", "GHS final",
    "Impact groupage",
]


def write_excel(rows: list[dict], output_path: str):
    """Write the parsed rows to a single-sheet .xlsx file.

    The header row is styled and frozen, an auto-filter covers the table,
    column widths are preset, and the "Décision UCR" column is colour-coded
    (green = favourable, red = unfavourable, yellow = mixed).
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Shared styles.
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"), right=Side(style="thin"),
        top=Side(style="thin"), bottom=Side(style="thin"),
    )
    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row.
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows.  The alignment object and the decision-column index are
    # loop invariants — previously they were rebuilt for every single cell.
    data_align = Alignment(vertical="top", wrap_text=True)
    dec_col = HEADERS.index("Decision") + 1
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=data.get(key, ""))
            cell.border = thin_border
            cell.alignment = data_align
        # Colour-code the decision cell.
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths (in characters); unknown keys default to 15.
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22, "Decision": 24,
        "Texte_decision_complet": 80, "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10, "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter over the whole table and freeze the header row.
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"
    ws.freeze_panes = "A2"
    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    # Default to the sample PDF next to the script when no path is given.
    if len(sys.argv) < 2:
        pdf_path = str(Path(__file__).parent / "SPHO-FINANC26020915121.pdf")
    else:
        pdf_path = sys.argv[1]
    output_path = str(Path(pdf_path).with_suffix(".xlsx"))
    print(f"Fichier PDF : {pdf_path}")

    print("Étape 1/3 : OCR du document...")
    full_text = ocr_pdf(pdf_path)
    # Keep the raw OCR text alongside the PDF for debugging / re-runs.
    txt_path = str(Path(pdf_path).with_suffix(".txt"))
    Path(txt_path).write_text(full_text, encoding="utf-8")
    print(f" Texte brut sauvegardé : {txt_path}")

    print("Étape 2/3 : Extraction des décisions...")
    rows = parse_document(full_text)
    print(f" {len(rows)} dossiers OGC extraits.")
    # Quick console summary of the classification.
    fav = sum(1 for r in rows
              if "Favorable" in r.get("Decision", "")
              and "Défavorable" not in r.get("Decision", ""))
    defav = sum(1 for r in rows if "Défavorable" in r.get("Decision", ""))
    mixte = sum(1 for r in rows if "Mixte" in r.get("Decision", ""))
    indet = sum(1 for r in rows if r.get("Decision", "") in ("Indéterminé", ""))
    refs_count = sum(1 for r in rows if r.get("References_guide"))
    codes_ret = sum(1 for r in rows if r.get("Codes_retenus_final"))
    regles = sum(1 for r in rows if r.get("Regles_citees"))
    print(f" Favorable établissement : {fav}")
    print(f" Défavorable établissement : {defav}")
    print(f" Mixte : {mixte}")
    print(f" Indéterminé : {indet}")
    print(f" Avec références citées : {refs_count}")
    print(f" Avec codes retenus : {codes_ret}")
    print(f" Avec règles T : {regles}")

    print("Étape 3/3 : Génération du fichier Excel...")
    write_excel(rows, output_path)
    print("Terminé.")


if __name__ == "__main__":
    main()