chore: add .gitignore
This commit is contained in:
690
cpam/parse_decision_ucr.py
Normal file
690
cpam/parse_decision_ucr.py
Normal file
@@ -0,0 +1,690 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
parse_decision_ucr.py — Extraction des décisions UCR depuis un PDF scanné (contrôle T2A)
|
||||
|
||||
Entrée : PDF scanné de décision UCR (CPAM / Assurance Maladie)
|
||||
Sortie : Fichier Excel (.xlsx) avec une feuille unique
|
||||
|
||||
Colonnes extraites (enrichies pour analyse IA) :
|
||||
Champ, OGC, Type_desaccord,
|
||||
Code_etablissement, Libelle_etablissement,
|
||||
Code_controleurs, Libelle_controleurs,
|
||||
Codes_retenus_final,
|
||||
Decision, Texte_decision_complet, Resume_motif,
|
||||
Regles_citees, References_guide,
|
||||
GHM_mentionne, GHS_mentionne, GHM_final, GHS_final,
|
||||
Impact_groupage
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pymupdf
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
import unicodedata
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 0. Normalisation texte OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Single-character OCR fix-ups applied in one pass via str.translate.
_OCR_CHAR_MAP = str.maketrans({
    "\u2018": "'",   # left single quote
    "\u2019": "'",   # right single quote
    "\u201C": '"',   # left double quote
    "\u201D": '"',   # right double quote
    "\u00AB": '"',   # «
    "\u00BB": '"',   # »
    "\u00A0": " ",   # no-break space
    "\u202F": " ",   # narrow no-break space
})


def normalize_text(text: str) -> str:
    """Normalize OCR output: unify apostrophes, quotes and spaces.

    Also repairs frequent Tesseract misreads of "l'UCR".
    """
    text = text.translate(_OCR_CHAR_MAP)
    # Collapse doubled apostrophes produced by the quote mapping above.
    text = text.replace("''", "'")
    # Common OCR misreads of "l'UCR".
    text = re.sub(r"\bF'UCR\b", "l'UCR", text)
    # NOTE(review): this pattern can no longer match once "''" has been
    # collapsed above — kept for parity with the original behaviour.
    text = re.sub(r"\bl''UCR\b", "l'UCR", text)
    return text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """OCR every page of *pdf_path* with Tesseract (French) and return the text.

    Pages are rendered at *dpi* (default 300) before being fed to Tesseract;
    the per-page texts are joined with blank lines and normalized.
    """
    doc = pymupdf.open(pdf_path)
    page_count = len(doc)
    # PyMuPDF renders at 72 dpi by default; scale up to the requested dpi.
    zoom = pymupdf.Matrix(dpi / 72, dpi / 72)
    page_texts = []
    for index, page in enumerate(doc):
        print(f"  OCR page {index + 1}/{page_count}...", end="\r")
        pix = page.get_pixmap(matrix=zoom)
        image = Image.open(io.BytesIO(pix.tobytes("png")))
        page_texts.append(pytesseract.image_to_string(image, lang="fra"))
    print(f"  OCR terminé : {page_count} pages.          ")
    return normalize_text("\n\n".join(page_texts))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Parsing — Regex
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# "Champ n" section header (e.g. "Champ 1 : Séjours ...").
RE_CHAMP = re.compile(
    r"Champ\s*(?:n°\s*)?(\d+)\s*[:\-—]?\s*(?:Séjours|:)",
    re.IGNORECASE,
)

# "OGC n :" case header at the start of a line.
RE_OGC_HEADER = re.compile(
    r"(?:^|\n)\s*OGC\s+(\d+)\s*:",
    re.MULTILINE,
)

# Nature of the disagreement: DP, DAS or both.
RE_TYPE_DESACCORD = re.compile(
    r"(?:désaccord|discussion)\s+porte\s+(?:sur\s+)?(?:le\s+|les\s+)?(DP\s+et\s+(?:le\s+)?DAS|DP\s+et\s+DAS|DP|DAS)",
    re.IGNORECASE,
)

# ICD-10 / CIM-10 diagnosis code, e.g. "J44.1".
RE_CIM10 = re.compile(r"\b([A-Z]\d{2}(?:\.\d{1,2})?)\b")

# "Codage établissement :" section — captured up to the controllers section.
RE_CODAGE_ETS = re.compile(
    r"Codage\s+[ée]tablissement\s*:\s*(.*?)(?=Codage\s+contr[ôo]leurs)",
    re.IGNORECASE | re.DOTALL,
)

# "Codage contrôleurs :" section — captured up to the UCR decision heading.
# [I1]? tolerates the common OCR misread "DÉC1SION"/"DÉCSION".
RE_CODAGE_CTRL = re.compile(
    r"Codage\s+contr[ôo]leurs\s*:\s*(.*?)(?=D[EÉ]C[I1]?SION\s+UCR|PROPOSITION\s+UCR)",
    re.IGNORECASE | re.DOTALL,
)

# "DÉCISION UCR :" / "PROPOSITION UCR :" heading plus everything that follows.
RE_DECISION = re.compile(
    r"(?:D[EÉ]C[I1]?SION|PROPOSITION)\s+UCR\s*:?\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# --- Decision classification ---

# Phrasings meaning the UCR sided with the establishment.
RE_FAVORABLE = re.compile(
    r"(?:"
    r"retient\s+(?:la\s+demande|le\s+codage|l'avis)\s+(?:de\s+)?l'[ée]tablissement"
    r"|retient\s+en\s+D[PA]S\s+le\s+code"
    r"|retient\s+le\s+codage\s+du\s+DP\s+de\s+l'[ée]tablissement"
    r"|l'UCR\s+retient\s+l'avis\s+de\s+l'[ée]tablissement"
    r"|confirme\s+l'avis\s+(?:de\s+)?l'[ée]tablissement"
    r")",
    re.IGNORECASE,
)

# Phrasing meaning the UCR sided with the medical controllers.
RE_DEFAVORABLE = re.compile(
    r"confirme\s+l'avis\s+des\s+(?:m[ée]decins\s+)?contr[oô]leurs",
    re.IGNORECASE,
)

# Weaker signals combined by classify_decision as fallbacks.
RE_UCR_RETIENT = re.compile(r"l'UCR\s+retient\b", re.IGNORECASE)
RE_UCR_PROPOSE = re.compile(r"l'UCR\s+propose\b", re.IGNORECASE)
RE_NE_RETIENT_PAS = re.compile(r"ne\s+retient\s+pas", re.IGNORECASE)

# --- GHM / GHS ---

# GHM code (5-7 alphanumerics) and GHS number (3-5 digits).
RE_GHM = re.compile(r"GHM\s+([A-Z0-9]{5,7})", re.IGNORECASE)
RE_GHS = re.compile(r"GHS\s+(\d{3,5})", re.IGNORECASE)

# Impact on grouping: "better valued" vs. "unchanged".
RE_MIEUX_VALORISE = re.compile(r"mieux\s+valoris[ée]", re.IGNORECASE)
RE_PAS_MODIFIE = re.compile(
    r"(?:ne\s+modifie\s+pas|ne\s+change(?:nt)?\s+pas|pas\s+de\s+changement|reste\s+group[ée])",
    re.IGNORECASE,
)

# --- Cited rules and references ---

# Methodological-guide pages, phrased "guide p. 42" ...
RE_GUIDE_PAGE = re.compile(
    r"(?:guide\s+m[ée]thodologique|guide)\s*(?:p\.?|page)\s*(\d{1,3})",
    re.IGNORECASE,
)
# ... or "p. 42 du guide".
RE_PAGE_GUIDE = re.compile(
    r"(?:p\.?|page)\s*(\d{1,3})\s+du\s+guide",
    re.IGNORECASE,
)

# "T" coding rules (T3, T7, etc.).
RE_REGLE_T = re.compile(
    r"r[èe]gle\s+(T\d+)",
    re.IGNORECASE,
)

# ATIH fascicles: optional number (g1), subject (g2), year (g3), page (g4).
RE_FASCICULE = re.compile(
    r"fascicule\s+(?:ATIH\s+)?(?:de\s+codage\s+)?(?:PMSI\s+)?(?:n°\s*)?(\d{1,2})?\s*(?:[-–]\s*)?([A-ZÀ-Üa-zà-ü\s]+?)(?:\s+(?:de\s+)?(\d{4}))?(?:\s*(?:,\s*)?(?:p\.?\s*|page\s*)(\d+))?",
    re.IGNORECASE,
)

# Agora opinions ("avis Agora n° 12").
RE_AVIS_AGORA = re.compile(
    r"avis\s+(?:agora|AGORA)\s*(?:n°\s*)?(\d+)",
    re.IGNORECASE,
)

# Coding instructions with a page number.
RE_CONSIGNES_CODAGE = re.compile(
    r"consignes?\s+de\s+codage\s*(?:p\.?\s*|page\s*)(\d+)",
    re.IGNORECASE,
)

# Code retained by the decision ("codage retenu : X", "DP = X", "se code X"...).
RE_CODAGE_RETENU = re.compile(
    r"(?:codage\s+retenu|DP\s*(?:retenu|=)|DAS\s*(?:retenu|=)|code\s+retenu|est\s+cod[ée]\s+en|se\s+code)\s*(?:est\s+)?(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)

# Code added as DP/DAS ("est ajouté en DAS", "ajout du code X").
RE_CODE_AJOUTE = re.compile(
    r"(?:est\s+ajout[ée]\s+en\s+D[PA]S|ajout(?:er)?\s+(?:du\s+|en\s+D[PA]S\s+(?:le\s+)?)?(?:code\s+)?)\s*(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2b. Fonctions d'extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract_codes_and_label(text: str) -> tuple[str, str]:
    """Pull the CIM-10 codes and the quoted label(s) out of a coding block.

    Returns ``(codes, label)`` where *codes* joins all CIM-10 matches with
    " + " (empty string when none) and *label* joins all quoted fragments
    with " | ", falling back to the first 120 chars of the raw block.
    """
    found_codes = RE_CIM10.findall(text)
    quoted = re.findall(r'[«"](.*?)[»"]', text)
    joined_codes = " + ".join(found_codes)  # "" when nothing matched
    if quoted:
        label = " | ".join(quoted)
    else:
        label = text.strip()[:120]  # fallback: truncated raw block
    label = re.sub(r"\s+", " ", label).strip()
    return joined_codes, label
|
||||
|
||||
|
||||
def extract_codes_retenus(decision_text: str) -> str:
    """Collect the CIM-10 codes the UCR ultimately retained.

    Matches both "retained code" phrasings and "code added as DP/DAS"
    phrasings; the codes are returned sorted and joined with " + "
    (empty string when none are found).
    """
    retained = {m.group(1) for m in RE_CODAGE_RETENU.finditer(decision_text)}
    retained.update(m.group(1) for m in RE_CODE_AJOUTE.finditer(decision_text))
    return " + ".join(sorted(retained))
|
||||
|
||||
|
||||
def extract_regles(text: str) -> str:
    """Return the cited coding rules (T3, T7, ...) sorted and comma-joined.

    Empty string when no "règle Tn" mention is found.
    """
    cited = {m.group(1).upper() for m in RE_REGLE_T.finditer(text)}
    return ", ".join(sorted(cited))
|
||||
|
||||
|
||||
def extract_references(text: str) -> str:
    """Gather every documentary reference cited in *text*.

    Covers methodological-guide pages, ATIH fascicles, Agora opinions and
    coding instructions. References are de-duplicated case-insensitively
    (first occurrence wins) and joined with " ; ".
    """
    refs: list[str] = []

    # Methodological-guide pages (both "guide p.N" and "p.N du guide").
    guide_pages = {m.group(1) for m in RE_GUIDE_PAGE.finditer(text)}
    guide_pages.update(m.group(1) for m in RE_PAGE_GUIDE.finditer(text))
    if guide_pages:
        refs.append("Guide méthodologique p." + ", p.".join(sorted(guide_pages, key=int)))

    # ATIH fascicles: optional number, subject, year, page.
    for m in RE_FASCICULE.finditer(text):
        pieces = ["Fascicule"]
        if m.group(1):
            pieces.append(m.group(1))
        sujet = (m.group(2) or "").strip()
        if sujet:
            pieces.append(sujet)
        if m.group(3):
            pieces.append(f"({m.group(3)})")
        if m.group(4):
            pieces.append(f"p.{m.group(4)}")
        refs.append(" ".join(pieces))

    # Agora opinions.
    for m in RE_AVIS_AGORA.finditer(text):
        refs.append(f"Avis Agora n°{m.group(1)}")

    # Coding instructions.
    for m in RE_CONSIGNES_CODAGE.finditer(text):
        refs.append(f"Consignes de codage p.{m.group(1)}")

    # Case-insensitive de-duplication; dict preserves first-seen order.
    unique: dict[str, str] = {}
    for ref in refs:
        unique.setdefault(ref.lower(), ref)
    return " ; ".join(unique.values())
|
||||
|
||||
|
||||
def extract_ghm_ghs_all(text: str) -> tuple[list[str], list[str]]:
    """Return every distinct GHM and GHS mentioned, in first-appearance order.

    GHM codes are upper-cased; GHS numbers are kept as matched.
    """
    # dict.fromkeys keeps insertion order while dropping duplicates.
    ghms = list(dict.fromkeys(m.group(1).upper() for m in RE_GHM.finditer(text)))
    ghss = list(dict.fromkeys(m.group(1) for m in RE_GHS.finditer(text)))
    return ghms, ghss
|
||||
|
||||
|
||||
def classify_decision(decision_text: str) -> str:
    """Classify a UCR decision text.

    Returns one of "Favorable établissement", "Défavorable établissement",
    "Mixte" or "Indéterminé", based on pattern matches over the normalized
    text.
    """
    text = normalize_text(decision_text)

    fav = bool(RE_FAVORABLE.search(text))
    defav = bool(RE_DEFAVORABLE.search(text))

    ucr_retient = bool(RE_UCR_RETIENT.search(text))
    ucr_propose = bool(RE_UCR_PROPOSE.search(text))
    ne_retient_pas = bool(RE_NE_RETIENT_PAS.search(text))

    # "l'UCR retient ..." counts as favourable unless negated ("ne retient pas").
    if ucr_retient and not ne_retient_pas:
        fav = True
    # "l'UCR propose ..." counts as favourable when nothing unfavourable matched.
    if ucr_propose and not defav:
        fav = True

    # Both sides present -> mixed. (The original had a second, unreachable
    # `if fav and defav: return "Mixte"` check fully subsumed by this one;
    # it has been removed — behaviour is unchanged.)
    if defav and (fav or ucr_retient):
        return "Mixte"
    if fav:
        return "Favorable établissement"
    if defav:
        return "Défavorable établissement"
    return "Indéterminé"
|
||||
|
||||
|
||||
def clean_decision_text(text: str) -> str:
    """Strip OCR artefacts/footers from a decision block and normalize spacing.

    Removes UCR footer lines, trailing OCR noise, collapses runs of
    spaces/tabs and caps consecutive blank lines, then strips the result.
    """
    cleaned = text
    # Ordered (pattern, flags) pairs: footers first, then end-of-block noise.
    # NOTE(review): the middle pattern's alternation "(?:—|—|-)" repeats the
    # em dash — possibly an en dash was intended; kept byte-identical.
    noise_patterns = (
        (r"\n\s*(?:UCR\s+NA|CONFIDENTIEL|Page\s+\d+).*$", re.MULTILINE | re.IGNORECASE),
        (r"\n\s*[A-Z]{1,4}\s*(?:—|—|-)\s*[a-zA-Z]{0,3}\s*$", re.MULTILINE),
        (r"\n\s*(?:EE|ESS|2 ae|A D ES|EE nd)\s*$", re.MULTILINE | re.IGNORECASE),
    )
    for pattern, flags in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)
    cleaned = re.sub(r"[ \t]+", " ", cleaned)    # collapse spaces and tabs
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)  # at most one blank line
    return cleaned.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2c. Parsing des blocs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def parse_ogc_block(block_text: str, champ: int, ogc_num: int) -> dict:
    """Parse one OGC block and return an enriched, structured row.

    Args:
        block_text: OCR text covering this single OGC case.
        champ: number of the "Champ" section the block belongs to.
        ogc_num: OGC case number.

    Returns:
        A dict whose keys match the HEADERS columns; fields that could not
        be extracted are left as empty strings.
    """
    # All columns pre-initialised so the Excel writer can rely on the keys.
    result = {
        "Champ": champ,
        "OGC": ogc_num,
        "Type_desaccord": "",
        "Code_etablissement": "",
        "Libelle_etablissement": "",
        "Code_controleurs": "",
        "Libelle_controleurs": "",
        "Codes_retenus_final": "",
        "Decision": "",
        "Texte_decision_complet": "",
        "Resume_motif": "",
        "Regles_citees": "",
        "References_guide": "",
        "GHM_mentionne": "",
        "GHS_mentionne": "",
        "GHM_final": "",
        "GHS_final": "",
        "Impact_groupage": "",
    }

    # Type of disagreement (DP / DAS / both).
    m = RE_TYPE_DESACCORD.search(block_text)
    if m:
        raw = m.group(1).upper().strip()
        raw = re.sub(r"\s+", " ", raw)
        # Check the combined case first: "DP ET DAS" contains both tokens.
        if "DP" in raw and "DAS" in raw:
            result["Type_desaccord"] = "DP + DAS"
        elif "DAS" in raw:
            result["Type_desaccord"] = "DAS"
        elif "DP" in raw:
            result["Type_desaccord"] = "DP"

    # Establishment coding.
    m = RE_CODAGE_ETS.search(block_text)
    if m:
        raw_ets = m.group(1).strip()
        result["Code_etablissement"], result["Libelle_etablissement"] = extract_codes_and_label(raw_ets)

    # Controllers coding ("non repris" = the controllers dropped the code).
    m = RE_CODAGE_CTRL.search(block_text)
    if m:
        raw_ctrl = m.group(1).strip()
        if re.search(r"non\s+repris", raw_ctrl, re.IGNORECASE):
            result["Code_controleurs"] = "non repris"
            result["Libelle_controleurs"] = ""
        else:
            result["Code_controleurs"], result["Libelle_controleurs"] = extract_codes_and_label(raw_ctrl)

    # UCR decision — FULL TEXT.
    m = RE_DECISION.search(block_text)
    if m:
        decision_text = m.group(1).strip()
        decision_clean = clean_decision_text(decision_text)

        result["Decision"] = classify_decision(decision_clean)
        result["Texte_decision_complet"] = decision_clean

        # Short summary (first ~300 chars of the cleaned text).
        resume = re.sub(r"\s+", " ", decision_clean)[:300].strip()
        # Cut at the last complete sentence when one fits.
        last_dot = resume.rfind(".")
        if last_dot > 100:
            resume = resume[:last_dot + 1]
        result["Resume_motif"] = resume

        # Codes finally retained by the UCR.
        result["Codes_retenus_final"] = extract_codes_retenus(decision_clean)

    # Cited coding rules (T3, T7, etc.).
    result["Regles_citees"] = extract_regles(block_text)

    # References (guide, fascicles, Agora opinions).
    result["References_guide"] = extract_references(block_text)

    # GHM / GHS — all mentions, plus the last one (= assumed final).
    ghms, ghss = extract_ghm_ghs_all(block_text)
    if ghms:
        result["GHM_mentionne"] = " / ".join(ghms)
        result["GHM_final"] = ghms[-1]  # the last one mentioned is often the final one
    if ghss:
        result["GHS_mentionne"] = " / ".join(ghss)
        result["GHS_final"] = ghss[-1]

    # Impact on grouping.
    if RE_MIEUX_VALORISE.search(block_text):
        result["Impact_groupage"] = "Mieux valorisé"
    elif RE_PAS_MODIFIE.search(block_text):
        result["Impact_groupage"] = "Pas de changement"

    return result
|
||||
|
||||
|
||||
def parse_grouped_ogcs(text_block: str, champ: int, ogc_nums: list[int]) -> list[dict]:
    """Expand a block covering several OGC at once into one row per number.

    The block is parsed a single time; the resulting row is then cloned for
    every OGC in the group with only the "OGC" field changed.
    """
    base = parse_ogc_block(text_block, champ, ogc_nums[0])
    return [{**base, "OGC": num} for num in ogc_nums]
|
||||
|
||||
|
||||
def parse_document(full_text: str) -> list[dict]:
    """Parse the whole OCR text and return one structured row per OGC case.

    Handles both individual blocks ("OGC 12 :") and grouped decisions
    ("Concernant les OGC 14, 19, 46"), then de-duplicates, keeping for each
    OGC the row with the most filled-in fields. Rows are sorted by
    (Champ, OGC).
    """
    rows = []

    # Start offsets of every "Champ n" header and every "OGC n :" header.
    champ_positions = [(m.start(), int(m.group(1))) for m in RE_CHAMP.finditer(full_text)]
    ogc_positions = [(m.start(), int(m.group(1))) for m in RE_OGC_HEADER.finditer(full_text)]

    def get_champ_for_position(pos: int) -> int:
        # Champ in effect at `pos` = last Champ header starting before it.
        ch = 0
        for cp, cn in champ_positions:
            if cp <= pos:
                ch = cn
            else:
                break
        return ch

    # Grouped blocks ("Concernant les OGC 14, 19, 46, 50 ...").
    RE_GROUPED = re.compile(
        r"(?:Concernant|Pour)\s+les\s+OGC\s+([\d,\s]+)",
        re.IGNORECASE,
    )

    grouped_ogcs = set()
    for m in RE_GROUPED.finditer(full_text):
        nums = [int(n.strip()) for n in m.group(1).split(",") if n.strip().isdigit()]
        if len(nums) > 1:
            start = m.start()
            end = len(full_text)
            # Block ends at the next OGC header that is NOT part of the group;
            # the +50 margin skips headers overlapping the group mention itself.
            for op, on in ogc_positions:
                if op > start + 50 and on not in nums:
                    end = op
                    break
            block = full_text[start:end]
            champ = get_champ_for_position(start)
            group_rows = parse_grouped_ogcs(block, champ, nums)
            rows.extend(group_rows)
            grouped_ogcs.update(nums)

    # Individual OGC blocks.
    for idx, (pos, ogc_num) in enumerate(ogc_positions):
        champ = get_champ_for_position(pos)

        # Block ends at the next OGC header beyond a small margin ...
        end = len(full_text)
        for next_pos, _ in ogc_positions[idx + 1:]:
            if next_pos > pos + 20:
                end = next_pos
                break
        # ... or at the next Champ header, whichever comes first.
        for cp, _ in champ_positions:
            if pos < cp < end:
                end = cp
                break

        block = full_text[pos:end]
        row = parse_ogc_block(block, champ, ogc_num)

        if ogc_num in grouped_ogcs:
            # An individual block overrides its grouped row only when complete.
            if row["Code_etablissement"] and row["Decision"]:
                rows = [r for r in rows if r["OGC"] != ogc_num]
                rows.append(row)
        else:
            if row["Code_etablissement"] or row["Decision"]:
                rows.append(row)

    rows.sort(key=lambda r: (r["Champ"], r["OGC"]))

    # De-duplicate: for a repeated OGC keep the row with more non-empty fields.
    seen = {}
    deduped = []
    for r in rows:
        key = r["OGC"]
        if key in seen:
            old = seen[key]
            old_score = sum(1 for v in old.values() if v)
            new_score = sum(1 for v in r.values() if v)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)

    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Export Excel
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Internal column keys — must match the dict keys produced by parse_ogc_block.
HEADERS = [
    "Champ",
    "OGC",
    "Type_desaccord",
    "Code_etablissement",
    "Libelle_etablissement",
    "Code_controleurs",
    "Libelle_controleurs",
    "Codes_retenus_final",
    "Decision",
    "Texte_decision_complet",
    "Resume_motif",
    "Regles_citees",
    "References_guide",
    "GHM_mentionne",
    "GHS_mentionne",
    "GHM_final",
    "GHS_final",
    "Impact_groupage",
]

# Human-readable column titles for the Excel sheet.
# NOTE: must stay index-aligned with HEADERS above.
HEADER_LABELS = [
    "Champ",
    "N° OGC",
    "Type désaccord",
    "Code(s) Établissement",
    "Libellé Établissement",
    "Code(s) Contrôleurs",
    "Libellé Contrôleurs",
    "Code(s) retenus (final)",
    "Décision UCR",
    "Texte décision complet",
    "Résumé du motif",
    "Règles codage citées",
    "Références (guide, fascicules, avis)",
    "GHM mentionné(s)",
    "GHS mentionné(s)",
    "GHM final",
    "GHS final",
    "Impact groupage",
]
|
||||
|
||||
|
||||
def write_excel(rows: list[dict], output_path: str):
    """Write the parsed rows to a styled, single-sheet Excel workbook.

    Columns follow HEADERS/HEADER_LABELS. The "Décision UCR" column is
    colour-coded (green = favourable, red = unfavourable, yellow = mixed).
    An auto-filter covers the table and the header row is frozen.

    Args:
        rows: list of dicts as produced by parse_document.
        output_path: destination .xlsx path.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Header styles.
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )

    # Decision colour coding (classic Excel conditional-formatting palette).
    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row.
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows.
    # PERF: hoist loop invariants — the original recomputed
    # HEADERS.index("Decision") per row and built a new Alignment per cell.
    data_align = Alignment(vertical="top", wrap_text=True)
    dec_col = HEADERS.index("Decision") + 1
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            cell = ws.cell(row=row_idx, column=col_idx, value=data.get(key, ""))
            cell.border = thin_border
            cell.alignment = data_align

        # Colour the Decision column according to the classification.
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths (characters); anything not listed defaults to 15.
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22,
        "Decision": 24, "Texte_decision_complet": 80,
        "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10,
        "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter over the whole table.
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"

    # Freeze the header row.
    ws.freeze_panes = "A2"

    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """CLI entry point: OCR the PDF, extract the decisions, export to Excel."""
    # Input PDF: first CLI argument, or a default file next to this script.
    if len(sys.argv) < 2:
        pdf_path = str(Path(__file__).parent / "SPHO-FINANC26020915121.pdf")
    else:
        pdf_path = sys.argv[1]

    output_path = str(Path(pdf_path).with_suffix(".xlsx"))

    print(f"Fichier PDF : {pdf_path}")
    print("Étape 1/3 : OCR du document...")
    full_text = ocr_pdf(pdf_path)

    # Keep the raw OCR text next to the PDF for debugging / re-runs.
    txt_path = str(Path(pdf_path).with_suffix(".txt"))
    Path(txt_path).write_text(full_text, encoding="utf-8")
    print(f" Texte brut sauvegardé : {txt_path}")

    print("Étape 2/3 : Extraction des décisions...")
    rows = parse_document(full_text)
    print(f" {len(rows)} dossiers OGC extraits.")

    # Quick summary of the classification results.
    decisions = [r.get("Decision", "") for r in rows]
    fav = sum(1 for d in decisions if "Favorable" in d and "Défavorable" not in d)
    defav = sum(1 for d in decisions if "Défavorable" in d)
    mixte = sum(1 for d in decisions if "Mixte" in d)
    indet = sum(1 for d in decisions if d in ("Indéterminé", ""))
    refs_count = sum(1 for r in rows if r.get("References_guide"))
    codes_ret = sum(1 for r in rows if r.get("Codes_retenus_final"))
    regles = sum(1 for r in rows if r.get("Regles_citees"))

    print(f" Favorable établissement : {fav}")
    print(f" Défavorable établissement : {defav}")
    print(f" Mixte : {mixte}")
    print(f" Indéterminé : {indet}")
    print(f" Avec références citées : {refs_count}")
    print(f" Avec codes retenus : {codes_ret}")
    print(f" Avec règles T : {regles}")

    print("Étape 3/3 : Génération du fichier Excel...")
    write_excel(rows, output_path)
    print("Terminé.")
|
||||
|
||||
|
||||
# Script entry point (no side effects on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user