#!/usr/bin/env python3
"""
parse_decision_ucr.py — Extraction des décisions UCR depuis un PDF scanné (contrôle T2A)

Entrée : PDF scanné de décision UCR (CPAM / Assurance Maladie)
Sortie : Fichier Excel (.xlsx) avec une feuille unique

Colonnes extraites (enrichies pour analyse IA) :
    Champ, OGC, Type_desaccord,
    Code_etablissement, Libelle_etablissement,
    Code_controleurs, Libelle_controleurs,
    Codes_retenus_final,
    Decision, Texte_decision_complet, Resume_motif,
    Regles_citees, References_guide,
    GHM_mentionne, GHS_mentionne, GHM_final, GHS_final,
    Impact_groupage
"""
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import pymupdf
|
||
import pytesseract
|
||
from PIL import Image
|
||
import io
|
||
from openpyxl import Workbook
|
||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||
import unicodedata
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 0. Normalisation texte OCR
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def normalize_text(text: str) -> str:
    """Normalize apostrophes, quotation marks and spaces produced by OCR."""
    # Ordered replacement table: curly quotes and guillemets become ASCII
    # first, then doubled apostrophes are collapsed, then non-breaking and
    # narrow no-break spaces become plain spaces. Order matters: the "''"
    # collapse must run after the curly-apostrophe substitutions.
    replacements = (
        ("\u2018", "'"), ("\u2019", "'"),
        ("\u201C", '"'), ("\u201D", '"'),
        ("\u00AB", '"'), ("\u00BB", '"'),
        ("''", "'"),
        ("\u00A0", " "), ("\u202F", " "),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    # Common OCR misreads of "l'UCR".
    for pattern in (r"\bF'UCR\b", r"\bl''UCR\b"):
        text = re.sub(pattern, "l'UCR", text)
    return text
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 1. OCR
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """Run Tesseract OCR on every page of the PDF and return the normalized text.

    Pages are rendered to PNG at the requested DPI, OCR'd with the French
    language model, then joined with blank lines and passed through
    normalize_text().
    """
    doc = pymupdf.open(pdf_path)
    page_count = len(doc)
    pages_text = []
    for idx, page in enumerate(doc):
        print(f" OCR page {idx+1}/{page_count}...", end="\r")
        # PDF user space is 72 dpi; scale up to the requested resolution.
        scale = pymupdf.Matrix(dpi / 72, dpi / 72)
        pixmap = page.get_pixmap(matrix=scale)
        image = Image.open(io.BytesIO(pixmap.tobytes("png")))
        pages_text.append(pytesseract.image_to_string(image, lang="fra"))
    print(f" OCR terminé : {page_count} pages. ")
    return normalize_text("\n\n".join(pages_text))
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 2. Parsing — Regex
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Header of a "Champ" (control field) section, e.g. "Champ n° 1 : Séjours ...".
# Captures the field number.
RE_CHAMP = re.compile(
    r"Champ\s*(?:n°\s*)?(\d+)\s*[:\-—]?\s*(?:Séjours|:)",
    re.IGNORECASE,
)

# Header of an individual OGC case at the start of a line, e.g. "OGC 12 :".
# Captures the OGC number.
RE_OGC_HEADER = re.compile(
    r"(?:^|\n)\s*OGC\s+(\d+)\s*:",
    re.MULTILINE,
)

# Nature of the disagreement: DP (principal diagnosis), DAS (associated
# diagnoses), or both. Captures the raw phrase for later normalization.
RE_TYPE_DESACCORD = re.compile(
    r"(?:désaccord|discussion)\s+porte\s+(?:sur\s+)?(?:le\s+|les\s+)?(DP\s+et\s+(?:le\s+)?DAS|DP\s+et\s+DAS|DP|DAS)",
    re.IGNORECASE,
)

# ICD-10 (CIM-10) code, e.g. "I10" or "J44.0".
RE_CIM10 = re.compile(r"\b([A-Z]\d{2}(?:\.\d{1,2})?)\b")

# "Codage établissement : ..." block — everything up to the controllers'
# coding block (the [ôo] alternation tolerates OCR accent loss).
RE_CODAGE_ETS = re.compile(
    r"Codage\s+[ée]tablissement\s*:\s*(.*?)(?=Codage\s+contr[ôo]leurs)",
    re.IGNORECASE | re.DOTALL,
)

# "Codage contrôleurs : ..." block — everything up to the UCR decision or
# proposal heading (D[EÉ]C[I1]?SION tolerates OCR misreads of "DÉCISION").
RE_CODAGE_CTRL = re.compile(
    r"Codage\s+contr[ôo]leurs\s*:\s*(.*?)(?=D[EÉ]C[I1]?SION\s+UCR|PROPOSITION\s+UCR)",
    re.IGNORECASE | re.DOTALL,
)

# "DÉCISION UCR :" / "PROPOSITION UCR :" heading — captures the full decision
# text that follows, to the end of the block.
RE_DECISION = re.compile(
    r"(?:D[EÉ]C[I1]?SION|PROPOSITION)\s+UCR\s*:?\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# --- Decision classification ---

# Wordings indicating the UCR sided with the establishment.
RE_FAVORABLE = re.compile(
    r"(?:"
    r"retient\s+(?:la\s+demande|le\s+codage|l'avis)\s+(?:de\s+)?l'[ée]tablissement"
    r"|retient\s+en\s+D[PA]S\s+le\s+code"
    r"|retient\s+le\s+codage\s+du\s+DP\s+de\s+l'[ée]tablissement"
    r"|l'UCR\s+retient\s+l'avis\s+de\s+l'[ée]tablissement"
    r"|confirme\s+l'avis\s+(?:de\s+)?l'[ée]tablissement"
    r")",
    re.IGNORECASE,
)

# Wording indicating the UCR sided with the (medical) controllers.
RE_DEFAVORABLE = re.compile(
    r"confirme\s+l'avis\s+des\s+(?:m[ée]decins\s+)?contr[oô]leurs",
    re.IGNORECASE,
)

# Weaker signals used by classify_decision() to refine the verdict.
RE_UCR_RETIENT = re.compile(r"l'UCR\s+retient\b", re.IGNORECASE)
RE_UCR_PROPOSE = re.compile(r"l'UCR\s+propose\b", re.IGNORECASE)
RE_NE_RETIENT_PAS = re.compile(r"ne\s+retient\s+pas", re.IGNORECASE)

# --- GHM / GHS (case-mix group and tariff identifiers) ---

RE_GHM = re.compile(r"GHM\s+([A-Z0-9]{5,7})", re.IGNORECASE)
RE_GHS = re.compile(r"GHS\s+(\d{3,5})", re.IGNORECASE)

# Grouping impact wordings: better valued vs unchanged.
RE_MIEUX_VALORISE = re.compile(r"mieux\s+valoris[ée]", re.IGNORECASE)
RE_PAS_MODIFIE = re.compile(
    r"(?:ne\s+modifie\s+pas|ne\s+change(?:nt)?\s+pas|pas\s+de\s+changement|reste\s+group[ée])",
    re.IGNORECASE,
)

# --- Cited rules and references ---

# Methodological-guide page, "guide p. N" form. Captures the page number.
RE_GUIDE_PAGE = re.compile(
    r"(?:guide\s+m[ée]thodologique|guide)\s*(?:p\.?|page)\s*(\d{1,3})",
    re.IGNORECASE,
)
# Same reference in the reversed "p. N du guide" form.
RE_PAGE_GUIDE = re.compile(
    r"(?:p\.?|page)\s*(\d{1,3})\s+du\s+guide",
    re.IGNORECASE,
)

# Coding rules "T3", "T7", etc. Captures the rule identifier.
RE_REGLE_T = re.compile(
    r"r[èe]gle\s+(T\d+)",
    re.IGNORECASE,
)

# ATIH fascicles. Captures (1) optional number, (2) subject, (3) optional
# year, (4) optional page — all pieces are optional, so matches can be loose.
RE_FASCICULE = re.compile(
    r"fascicule\s+(?:ATIH\s+)?(?:de\s+codage\s+)?(?:PMSI\s+)?(?:n°\s*)?(\d{1,2})?\s*(?:[-–]\s*)?([A-ZÀ-Üa-zà-ü\s]+?)(?:\s+(?:de\s+)?(\d{4}))?(?:\s*(?:,\s*)?(?:p\.?\s*|page\s*)(\d+))?",
    re.IGNORECASE,
)

# Agora opinions, "avis Agora n° N". Captures the opinion number.
RE_AVIS_AGORA = re.compile(
    r"avis\s+(?:agora|AGORA)\s*(?:n°\s*)?(\d+)",
    re.IGNORECASE,
)

# Coding instructions with a page number. Captures the page.
RE_CONSIGNES_CODAGE = re.compile(
    r"consignes?\s+de\s+codage\s*(?:p\.?\s*|page\s*)(\d+)",
    re.IGNORECASE,
)

# "codage retenu" / "DP retenu" / "DAS retenu" wordings followed by an
# ICD-10 code. Captures the retained code.
RE_CODAGE_RETENU = re.compile(
    r"(?:codage\s+retenu|DP\s*(?:retenu|=)|DAS\s*(?:retenu|=)|code\s+retenu|est\s+cod[ée]\s+en|se\s+code)\s*(?:est\s+)?(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)

# "est ajouté en DAS" / "ajout du code X" wordings. Captures the added code.
RE_CODE_AJOUTE = re.compile(
    r"(?:est\s+ajout[ée]\s+en\s+D[PA]S|ajout(?:er)?\s+(?:du\s+|en\s+D[PA]S\s+(?:le\s+)?)?(?:code\s+)?)\s*(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 2b. Fonctions d'extraction
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def extract_codes_and_label(text: str) -> tuple[str, str]:
    """Extract the ICD-10 codes and the quoted label(s) from a coding block.

    Returns a pair (codes joined with " + ", labels joined with " | ").
    When no quoted label is found, a 120-character excerpt of the raw text
    is used as the label.
    """
    found_codes = RE_CIM10.findall(text)
    quoted_labels = re.findall(r'[«"](.*?)[»"]', text)
    if quoted_labels:
        label = " | ".join(quoted_labels)
    else:
        # Fallback: truncated raw excerpt when nothing is quoted.
        label = text.strip()[:120]
    label = re.sub(r"\s+", " ", label).strip()
    codes = " + ".join(found_codes) if found_codes else ""
    return codes, label
|
||
|
||
|
||
def extract_codes_retenus(decision_text: str) -> str:
    """Extract the ICD-10 codes ultimately retained in the UCR decision.

    Matches both "retained coding" and "added code" wordings; returns the
    sorted, deduplicated codes joined with " + " (empty string if none).
    """
    retained = {m.group(1) for m in RE_CODAGE_RETENU.finditer(decision_text)}
    retained |= {m.group(1) for m in RE_CODE_AJOUTE.finditer(decision_text)}
    return " + ".join(sorted(retained)) if retained else ""
|
||
|
||
|
||
def extract_regles(text: str) -> str:
    """Extract the coding rules cited in the text (T3, T7, etc.).

    Returns the sorted, deduplicated rule identifiers joined with ", "
    (empty string when no rule is cited).
    """
    cited = sorted({m.group(1).upper() for m in RE_REGLE_T.finditer(text)})
    return ", ".join(cited)
|
||
|
||
|
||
def extract_references(text: str) -> str:
    """Extract every cited reference: guide pages, ATIH fascicles, Agora
    opinions and coding instructions.

    Returns the references joined with " ; ", case-insensitively
    deduplicated while keeping first occurrences (empty string if none).
    """
    refs: list[str] = []

    # Methodological-guide pages, cited in either direction
    # ("guide p. 12" or "p. 12 du guide").
    guide_pages = {m.group(1) for m in RE_GUIDE_PAGE.finditer(text)}
    guide_pages |= {m.group(1) for m in RE_PAGE_GUIDE.finditer(text)}
    if guide_pages:
        joined_pages = ", p.".join(sorted(guide_pages, key=int))
        refs.append("Guide méthodologique p." + joined_pages)

    # ATIH fascicles: number, subject, year and page are all optional.
    for m in RE_FASCICULE.finditer(text):
        pieces = ["Fascicule"]
        if m.group(1):
            pieces.append(m.group(1))
        subject = (m.group(2) or "").strip()
        if subject:
            pieces.append(subject)
        if m.group(3):
            pieces.append(f"({m.group(3)})")
        if m.group(4):
            pieces.append(f"p.{m.group(4)}")
        refs.append(" ".join(pieces))

    # Agora opinions.
    for m in RE_AVIS_AGORA.finditer(text):
        refs.append(f"Avis Agora n°{m.group(1)}")

    # Coding instructions with their page.
    for m in RE_CONSIGNES_CODAGE.finditer(text):
        refs.append(f"Consignes de codage p.{m.group(1)}")

    # Case-insensitive dedup; dict preserves insertion order and setdefault
    # keeps the first spelling encountered.
    unique: dict[str, str] = {}
    for ref in refs:
        unique.setdefault(ref.lower(), ref)
    return " ; ".join(unique.values())
|
||
|
||
|
||
def extract_ghm_ghs_all(text: str) -> tuple[list[str], list[str]]:
    """Return every GHM and GHS mentioned in the text.

    Each list preserves order of first appearance with duplicates removed
    (dict.fromkeys keeps insertion order). GHM codes are upper-cased.
    """
    ghms = list(dict.fromkeys(m.group(1).upper() for m in RE_GHM.finditer(text)))
    ghss = list(dict.fromkeys(m.group(1) for m in RE_GHS.finditer(text)))
    return ghms, ghss
|
||
|
||
|
||
def classify_decision(decision_text: str) -> str:
    """Classify a UCR decision: Favorable / Défavorable / Mixte / Indéterminé.

    Heuristic over the normalized decision text:
    - explicit wordings siding with the establishment set ``fav``;
    - "confirme l'avis des contrôleurs" sets ``defav``;
    - "l'UCR retient" (not negated by "ne retient pas") and "l'UCR propose"
      (absent a controller confirmation) also count as favourable;
    - favourable and defavourable signals together yield "Mixte".

    Note: the original implementation had a second ``if fav and defav``
    check that was unreachable — it was fully shadowed by the broader
    ``(ucr_retient or fav) and defav`` test just above it — so it has been
    removed; behavior is unchanged.
    """
    text = normalize_text(decision_text)

    fav = bool(RE_FAVORABLE.search(text))
    defav = bool(RE_DEFAVORABLE.search(text))

    ucr_retient = bool(RE_UCR_RETIENT.search(text))
    ucr_propose = bool(RE_UCR_PROPOSE.search(text))
    ne_retient_pas = bool(RE_NE_RETIENT_PAS.search(text))

    # "l'UCR retient" is favourable unless negated; "l'UCR propose" is
    # favourable unless the controllers' opinion was confirmed.
    if ucr_retient and not ne_retient_pas:
        fav = True
    if ucr_propose and not defav:
        fav = True

    # A (possibly negated) "l'UCR retient" combined with a controller
    # confirmation is still reported as mixed.
    if (ucr_retient or fav) and defav:
        return "Mixte"
    if fav:
        return "Favorable établissement"
    if defav:
        return "Défavorable établissement"
    return "Indéterminé"
|
||
|
||
|
||
def clean_decision_text(text: str) -> str:
    """Strip OCR artefacts from the end of a decision block and tidy whitespace.

    Removes UCR footer lines (reference, confidentiality notice, page
    numbers), trailing OCR noise (short capitalised fragments around a dash,
    recurring garbage tokens), then collapses space runs and excess blank
    lines.
    """
    artefact_patterns = (
        # Footer lines: UCR reference, confidentiality notice, page numbers.
        (r"\n\s*(?:UCR\s+NA|CONFIDENTIEL|Page\s+\d+).*$", re.MULTILINE | re.IGNORECASE),
        # Short capitalised fragments around a dash at end of line.
        (r"\n\s*[A-Z]{1,4}\s*(?:—|—|-)\s*[a-zA-Z]{0,3}\s*$", re.MULTILINE),
        # Recurring OCR garbage tokens.
        (r"\n\s*(?:EE|ESS|2 ae|A D ES|EE nd)\s*$", re.MULTILINE | re.IGNORECASE),
    )
    for pattern, flags in artefact_patterns:
        text = re.sub(pattern, "", text, flags=flags)
    # Collapse horizontal whitespace runs and excessive blank lines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 2c. Parsing des blocs
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def parse_ogc_block(block_text: str, champ: int, ogc_num: int) -> dict:
    """Parse one OGC block and return an enriched, structured row.

    Args:
        block_text: OCR text of the block (from the "OGC n :" header to the
            next header).
        champ: number of the enclosing "Champ" section.
        ogc_num: OGC case number.

    Returns:
        A dict with one entry per output column (see HEADERS); fields that
        could not be extracted are left as empty strings.
    """
    # All columns pre-initialised so every row has the full schema.
    result = {
        "Champ": champ,
        "OGC": ogc_num,
        "Type_desaccord": "",
        "Code_etablissement": "",
        "Libelle_etablissement": "",
        "Code_controleurs": "",
        "Libelle_controleurs": "",
        "Codes_retenus_final": "",
        "Decision": "",
        "Texte_decision_complet": "",
        "Resume_motif": "",
        "Regles_citees": "",
        "References_guide": "",
        "GHM_mentionne": "",
        "GHS_mentionne": "",
        "GHM_final": "",
        "GHS_final": "",
        "Impact_groupage": "",
    }

    # Nature of the disagreement (DP / DAS / both), normalized.
    m = RE_TYPE_DESACCORD.search(block_text)
    if m:
        raw = m.group(1).upper().strip()
        raw = re.sub(r"\s+", " ", raw)
        if "DP" in raw and "DAS" in raw:
            result["Type_desaccord"] = "DP + DAS"
        elif "DAS" in raw:
            result["Type_desaccord"] = "DAS"
        elif "DP" in raw:
            result["Type_desaccord"] = "DP"

    # Establishment's coding.
    m = RE_CODAGE_ETS.search(block_text)
    if m:
        raw_ets = m.group(1).strip()
        result["Code_etablissement"], result["Libelle_etablissement"] = extract_codes_and_label(raw_ets)

    # Controllers' coding; "non repris" means the controllers did not keep
    # the code, so no code/label is extracted.
    m = RE_CODAGE_CTRL.search(block_text)
    if m:
        raw_ctrl = m.group(1).strip()
        if re.search(r"non\s+repris", raw_ctrl, re.IGNORECASE):
            result["Code_controleurs"] = "non repris"
            result["Libelle_controleurs"] = ""
        else:
            result["Code_controleurs"], result["Libelle_controleurs"] = extract_codes_and_label(raw_ctrl)

    # UCR decision — full text, classification and derived fields.
    m = RE_DECISION.search(block_text)
    if m:
        decision_text = m.group(1).strip()
        decision_clean = clean_decision_text(decision_text)

        result["Decision"] = classify_decision(decision_clean)
        result["Texte_decision_complet"] = decision_clean

        # Short summary: first ~300 chars, whitespace-collapsed.
        resume = re.sub(r"\s+", " ", decision_clean)[:300].strip()
        # Cut at the last full sentence (only if long enough to be useful).
        last_dot = resume.rfind(".")
        if last_dot > 100:
            resume = resume[:last_dot + 1]
        result["Resume_motif"] = resume

        # Codes ultimately retained by the UCR.
        result["Codes_retenus_final"] = extract_codes_retenus(decision_clean)

    # Cited coding rules (T3, T7, etc.) — searched in the whole block.
    result["Regles_citees"] = extract_regles(block_text)

    # References (guide pages, fascicles, Agora opinions).
    result["References_guide"] = extract_references(block_text)

    # GHM / GHS: all mentions kept; the last mention is taken as final.
    ghms, ghss = extract_ghm_ghs_all(block_text)
    if ghms:
        result["GHM_mentionne"] = " / ".join(ghms)
        result["GHM_final"] = ghms[-1]  # last mentioned is usually the final one
    if ghss:
        result["GHS_mentionne"] = " / ".join(ghss)
        result["GHS_final"] = ghss[-1]

    # Grouping impact, when explicitly worded.
    if RE_MIEUX_VALORISE.search(block_text):
        result["Impact_groupage"] = "Mieux valorisé"
    elif RE_PAS_MODIFIE.search(block_text):
        result["Impact_groupage"] = "Pas de changement"

    return result
|
||
|
||
|
||
def parse_grouped_ogcs(text_block: str, champ: int, ogc_nums: list[int]) -> list[dict]:
    """Parse a grouped block (e.g. OGC 14, 19, 46 and 50 decided together).

    The block is parsed once, then the resulting row is duplicated for every
    OGC number with only the "OGC" field changed.
    """
    base_row = parse_ogc_block(text_block, champ, ogc_nums[0])
    return [{**base_row, "OGC": num} for num in ogc_nums]
|
||
|
||
|
||
def parse_document(full_text: str) -> list[dict]:
    """Parse the full OCR text and return the list of case rows.

    Strategy:
    1. Locate every "Champ" and "OGC" header by character offset.
    2. Handle grouped wordings ("Concernant les OGC 14, 19, ...") by parsing
       the shared block once per group.
    3. Parse each individual OGC block, letting an individually-parsed row
       override its grouped counterpart when it is more complete.
    4. Deduplicate by OGC number, keeping the row with the most non-empty
       fields, and sort by (Champ, OGC).
    """
    rows = []

    # (offset, number) pairs for every section/case header.
    champ_positions = [(m.start(), int(m.group(1))) for m in RE_CHAMP.finditer(full_text)]
    ogc_positions = [(m.start(), int(m.group(1))) for m in RE_OGC_HEADER.finditer(full_text)]

    def get_champ_for_position(pos: int) -> int:
        # The enclosing Champ is the last header at or before `pos`
        # (positions come from finditer, hence in ascending order).
        ch = 0
        for cp, cn in champ_positions:
            if cp <= pos:
                ch = cn
            else:
                break
        return ch

    # Grouped blocks: several OGC decided in one shared paragraph.
    RE_GROUPED = re.compile(
        r"(?:Concernant|Pour)\s+les\s+OGC\s+([\d,\s]+)",
        re.IGNORECASE,
    )

    grouped_ogcs = set()
    for m in RE_GROUPED.finditer(full_text):
        nums = [int(n.strip()) for n in m.group(1).split(",") if n.strip().isdigit()]
        if len(nums) > 1:
            start = m.start()
            end = len(full_text)
            # The group's block ends at the next OGC header that is not part
            # of the group (the +50 skips a header overlapping the wording).
            for op, on in ogc_positions:
                if op > start + 50 and on not in nums:
                    end = op
                    break
            block = full_text[start:end]
            champ = get_champ_for_position(start)
            group_rows = parse_grouped_ogcs(block, champ, nums)
            rows.extend(group_rows)
            grouped_ogcs.update(nums)

    # Individual OGC blocks.
    for idx, (pos, ogc_num) in enumerate(ogc_positions):
        champ = get_champ_for_position(pos)

        # Block ends at the next OGC header (+20 guards against a duplicate
        # match of the same header) or at the next Champ header, whichever
        # comes first.
        end = len(full_text)
        for next_pos, _ in ogc_positions[idx + 1:]:
            if next_pos > pos + 20:
                end = next_pos
                break
        for cp, _ in champ_positions:
            if pos < cp < end:
                end = cp
                break

        block = full_text[pos:end]
        row = parse_ogc_block(block, champ, ogc_num)

        if ogc_num in grouped_ogcs:
            # Replace the grouped row only when the individual parse is more
            # complete (has both a code and a decision).
            if row["Code_etablissement"] and row["Decision"]:
                rows = [r for r in rows if r["OGC"] != ogc_num]
                rows.append(row)
        else:
            # Keep only rows where something meaningful was extracted.
            if row["Code_etablissement"] or row["Decision"]:
                rows.append(row)

    rows.sort(key=lambda r: (r["Champ"], r["OGC"]))

    # Deduplicate by OGC number: keep the row with the most non-empty fields.
    seen = {}
    deduped = []
    for r in rows:
        key = r["OGC"]
        if key in seen:
            old = seen[key]
            # Completeness score = number of non-empty field values.
            old_score = sum(1 for v in old.values() if v)
            new_score = sum(1 for v in r.values() if v)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)

    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 3. Export Excel
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Internal row-dict keys, in output column order (must match HEADER_LABELS).
HEADERS = [
    "Champ",
    "OGC",
    "Type_desaccord",
    "Code_etablissement",
    "Libelle_etablissement",
    "Code_controleurs",
    "Libelle_controleurs",
    "Codes_retenus_final",
    "Decision",
    "Texte_decision_complet",
    "Resume_motif",
    "Regles_citees",
    "References_guide",
    "GHM_mentionne",
    "GHS_mentionne",
    "GHM_final",
    "GHS_final",
    "Impact_groupage",
]

# Human-readable column headers written to the worksheet, same order as
# HEADERS (index-aligned).
HEADER_LABELS = [
    "Champ",
    "N° OGC",
    "Type désaccord",
    "Code(s) Établissement",
    "Libellé Établissement",
    "Code(s) Contrôleurs",
    "Libellé Contrôleurs",
    "Code(s) retenus (final)",
    "Décision UCR",
    "Texte décision complet",
    "Résumé du motif",
    "Règles codage citées",
    "Références (guide, fascicules, avis)",
    "GHM mentionné(s)",
    "GHS mentionné(s)",
    "GHM final",
    "GHS final",
    "Impact groupage",
]
|
||
|
||
|
||
def write_excel(rows: list[dict], output_path: str):
    """Write the parsed rows to an Excel file (single worksheet).

    Layout: styled header row, one data row per dict in `rows` (columns per
    HEADERS), Decision cells colour-coded, fixed column widths, auto-filter
    and a frozen header row.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Styles.
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )

    # Decision colour coding: green favourable, red defavourable, amber mixed.
    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row.
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows.
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            val = data.get(key, "")
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.border = thin_border
            cell.alignment = Alignment(vertical="top", wrap_text=True)

        # Colour the Decision cell according to the verdict. "Favorable" is
        # a substring of "Défavorable", hence the explicit exclusion.
        dec_col = HEADERS.index("Decision") + 1
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths (characters), keyed by internal column name.
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22,
        "Decision": 24, "Texte_decision_complet": 80,
        "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10,
        "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter over the whole table.
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"

    # Freeze the header row.
    ws.freeze_panes = "A2"

    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main():
    """CLI entry point: OCR the PDF, parse the decisions, export to Excel."""
    # PDF path from argv, with a development default next to this script.
    if len(sys.argv) >= 2:
        pdf_path = sys.argv[1]
    else:
        pdf_path = str(Path(__file__).parent / "SPHO-FINANC26020915121.pdf")

    output_path = str(Path(pdf_path).with_suffix(".xlsx"))

    print(f"Fichier PDF : {pdf_path}")
    print("Étape 1/3 : OCR du document...")
    full_text = ocr_pdf(pdf_path)

    # Keep the raw OCR text next to the PDF for inspection / re-runs.
    txt_path = str(Path(pdf_path).with_suffix(".txt"))
    Path(txt_path).write_text(full_text, encoding="utf-8")
    print(f" Texte brut sauvegardé : {txt_path}")

    print("Étape 2/3 : Extraction des décisions...")
    rows = parse_document(full_text)
    print(f" {len(rows)} dossiers OGC extraits.")

    # Summary statistics over the extracted rows.
    def count(pred):
        return sum(1 for r in rows if pred(r))

    fav = count(lambda r: "Favorable" in r.get("Decision", "") and "Défavorable" not in r.get("Decision", ""))
    defav = count(lambda r: "Défavorable" in r.get("Decision", ""))
    mixte = count(lambda r: "Mixte" in r.get("Decision", ""))
    indet = count(lambda r: r.get("Decision", "") in ("Indéterminé", ""))
    refs_count = count(lambda r: r.get("References_guide"))
    codes_ret = count(lambda r: r.get("Codes_retenus_final"))
    regles = count(lambda r: r.get("Regles_citees"))

    print(f" Favorable établissement : {fav}")
    print(f" Défavorable établissement : {defav}")
    print(f" Mixte : {mixte}")
    print(f" Indéterminé : {indet}")
    print(f" Avec références citées : {refs_count}")
    print(f" Avec codes retenus : {codes_ret}")
    print(f" Avec règles T : {regles}")

    print("Étape 3/3 : Génération du fichier Excel...")
    write_excel(rows, output_path)
    print("Terminé.")


if __name__ == "__main__":
    main()
|