chore: add .gitignore
2140
cpam/SPHO-FINANC26020915121.txt
Normal file
File diff suppressed because it is too large
BIN
cpam/SPHO-FINANC26020915121.xlsx_old
Normal file
Binary file not shown.
BIN
cpam/SPHO-FINANC26020915121_llm.xlsx_old
Normal file
Binary file not shown.
738
cpam/extract_t2a_llm.py
Normal file
@@ -0,0 +1,738 @@
#!/usr/bin/env python3
"""
extract_t2a_llm.py — General-purpose T2A extractor via OCR + LLM (Ollama)

Input  : PDF (scanned or native) of a T2A document (UCR decision, CPAM notification, ARS report…)
Output : Excel file (.xlsx) containing the structured data

Architecture:
    PDF → OCR/native text → type detection (1 LLM call) → block-by-block extraction (N LLM calls) → Excel

Usage:
    python extract_t2a_llm.py FICHIER.pdf [--model gemma3:27b-it-qat] [--output out.xlsx] [--verbose]
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import time
from pathlib import Path

import pymupdf
import requests
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side


# ---------------------------------------------------------------------------
# 0. OCR text normalisation
# ---------------------------------------------------------------------------

def normalize_text(text: str) -> str:
    """Normalise the apostrophes, quotation marks and spaces produced by OCR."""
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201C", '"').replace("\u201D", '"')
    text = text.replace("\u00AB", '"').replace("\u00BB", '"')
    text = text.replace("''", "'")
    text = text.replace("\u00A0", " ").replace("\u202F", " ")
    # Common OCR misreads of "l'UCR"
    text = re.sub(r"\bF'UCR\b", "l'UCR", text)
    text = re.sub(r"\bl''UCR\b", "l'UCR", text)
    return text
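
# Illustrative behaviour (doctest-style sketch; the input mixes a curly
# apostrophe and guillemets, both of which are normalised above):
#     >>> normalize_text("l\u2019UCR \u00ABavis\u00BB")
#     'l\'UCR "avis"'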


# ---------------------------------------------------------------------------
# 1. OCR / text extraction (docTR — deep learning, GPU)
# ---------------------------------------------------------------------------

_doctr_model = None


def _get_doctr_model():
    """Lazily initialise the docTR model (loaded once; GPU if VRAM is free, otherwise CPU)."""
    global _doctr_model
    if _doctr_model is not None:
        return _doctr_model

    from doctr.models import ocr_predictor

    print("  Chargement du modèle docTR (première utilisation)...")
    t0 = time.time()
    _doctr_model = ocr_predictor(
        det_arch="db_resnet50",
        reco_arch="crnn_vgg16_bn",
        pretrained=True,
    )

    # Move to the GPU if one is available with enough free VRAM
    try:
        import torch
        if torch.cuda.is_available():
            free_vram = torch.cuda.mem_get_info()[0] / (1024 ** 3)
            if free_vram > 1.0:
                try:
                    _doctr_model = _doctr_model.cuda()
                    print(f"  docTR sur GPU ({torch.cuda.get_device_name(0)}, "
                          f"{free_vram:.1f} Go libres) — {time.time() - t0:.1f}s")
                except torch.cuda.OutOfMemoryError:
                    _doctr_model = _doctr_model.cpu()
                    torch.cuda.empty_cache()
                    print(f"  GPU VRAM insuffisante, docTR sur CPU — {time.time() - t0:.1f}s")
            else:
                print(f"  GPU VRAM trop basse ({free_vram:.1f} Go libres, Ollama ?), "
                      f"docTR sur CPU — {time.time() - t0:.1f}s")
        else:
            print(f"  docTR sur CPU — {time.time() - t0:.1f}s")
    except ImportError:
        print(f"  docTR sur CPU — {time.time() - t0:.1f}s")

    return _doctr_model


def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """Extract the PDF text: native text when available, otherwise docTR OCR (GPU).

    NB: `dpi` is currently unused in this variant — pymupdf returns the native
    text directly and docTR handles rasterisation itself.
    """
    doc = pymupdf.open(pdf_path)
    total = len(doc)

    # Native text vs. scanned: decided on the first page
    first_page_text = doc[0].get_text() if total > 0 else ""
    is_native = len(first_page_text.strip()) > 100

    if is_native:
        print("  Mode : extraction texte natif (pymupdf)")
        full_text = []
        for i, page in enumerate(doc):
            print(f"  Extraction page {i+1}/{total}...", end="\r")
            full_text.append(page.get_text())
        print(f"  Extraction terminée : {total} pages.          ")
        return normalize_text("\n\n".join(full_text))

    # docTR OCR
    print("  Mode : OCR docTR (deep learning, GPU)")
    from doctr.io import DocumentFile

    model = _get_doctr_model()

    print(f"  Lecture du PDF ({total} pages)...")
    doc_pages = DocumentFile.from_pdf(pdf_path)
    print(f"  OCR en cours sur {len(doc_pages)} pages...")

    t0 = time.time()
    result = model(doc_pages)
    elapsed = time.time() - t0
    print(f"  OCR terminé : {total} pages en {elapsed:.1f}s "
          f"({elapsed/total:.1f}s/page)")

    full_text = result.render()
    return normalize_text(full_text)


# ---------------------------------------------------------------------------
# 2. Ollama client
# ---------------------------------------------------------------------------

# Model families without a native JSON output mode: routed through /api/chat instead
NO_FORMAT_JSON_PREFIXES = ("qwen3", "qwen2.5")

OLLAMA_URL = "http://localhost:11434"


def parse_json_response(raw: str) -> dict | list | None:
    """Parse a JSON response, tolerating markdown fences and stray text."""
    text = raw.strip()

    # Strip <think>...</think> blocks (Qwen3)
    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    # Strip markdown fences ```json ... ```
    if text.startswith("```"):
        first_nl = text.find("\n")
        if first_nl != -1:
            text = text[first_nl + 1:]
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
        text = text.strip()

    # Direct attempt
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Otherwise extract the first balanced JSON object or array
    for start_char, end_char in [("{", "}"), ("[", "]")]:
        start = text.find(start_char)
        if start == -1:
            continue
        depth = 0
        for i in range(start, len(text)):
            if text[i] == start_char:
                depth += 1
            elif text[i] == end_char:
                depth -= 1
                if depth == 0:
                    try:
                        return json.loads(text[start:i + 1])
                    except json.JSONDecodeError:
                        break

    return None
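
# Illustrative behaviour (doctest-style sketch):
#     >>> parse_json_response('```json\n{"ogc": 12}\n```')
#     {'ogc': 12}
#     >>> parse_json_response('Voici le résultat : {"ogc": 12, "decision": "Mixte"}')
#     {'ogc': 12, 'decision': 'Mixte'}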


def call_ollama(
    prompt: str,
    model: str,
    temperature: float = 0.1,
    max_tokens: int = 4000,
    timeout: int = 120,
    verbose: bool = False,
) -> dict | list | None:
    """Call Ollama. Uses the chat API with think=false for Qwen3."""
    is_qwen = any(model.startswith(p) for p in NO_FORMAT_JSON_PREFIXES)

    if is_qwen:
        # Chat API + think:false for Qwen3 (no native JSON format mode)
        endpoint = f"{OLLAMA_URL}/api/chat"
        body = {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False,
            "think": False,
            "options": {
                "temperature": temperature,
                "num_predict": max_tokens,
            },
        }
    else:
        # Generate API + native JSON format for the other models
        endpoint = f"{OLLAMA_URL}/api/generate"
        body = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "format": "json",
            "options": {
                "temperature": temperature,
                "num_predict": max_tokens,
            },
        }

    if verbose:
        print(f"\n--- PROMPT ({model}) ---")
        print(prompt[:500] + ("..." if len(prompt) > 500 else ""))
        print("--- FIN PROMPT ---\n")

    for attempt in range(2):
        try:
            t0 = time.time()
            response = requests.post(endpoint, json=body, timeout=timeout)
            elapsed = time.time() - t0
            response.raise_for_status()
            data = response.json()

            # Pull the text out of the response, depending on the API used
            if is_qwen:
                raw = data.get("message", {}).get("content", "")
            else:
                raw = data.get("response", "")

            if verbose:
                print(f"--- RÉPONSE ({elapsed:.1f}s) ---")
                print(raw[:500] + ("..." if len(raw) > 500 else ""))
                print("--- FIN RÉPONSE ---\n")

            result = parse_json_response(raw)
            if result is not None:
                return result
            if attempt == 0:
                print(f"  [warn] JSON invalide, retry... (raw: {raw[:100]})")
        except requests.ConnectionError:
            print("[ERREUR] Ollama non disponible sur localhost:11434")
            sys.exit(1)
        except requests.Timeout:
            print(f"  [warn] Timeout ({timeout}s) — tentative {attempt + 1}/2")
            if attempt == 1:
                return None
        except requests.RequestException as e:
            print(f"  [warn] Erreur requête : {e}")
            return None

    return None
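
# Minimal usage sketch (assumes an Ollama server on localhost:11434 with the
# model already pulled; the prompt here is a made-up placeholder):
#     result = call_ollama("Réponds en JSON : ...", model="gemma3:27b-it-qat")
#     # → dict | list on success, None after retries/timeouts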


# ---------------------------------------------------------------------------
# 3. Phase 1 — Document type detection
# ---------------------------------------------------------------------------

PROMPT_PHASE1 = """\
Tu es un expert en codage PMSI et contrôle T2A. Analyse le début de ce document et identifie sa structure.

TEXTE (début du document) :
---
{text_preview}
---

Réponds UNIQUEMENT en JSON avec ces champs :
{{
  "type_document": "decision_ucr | notification_cpam | rapport_controle | autre",
  "organisme": "nom de l'organisme (CPAM, UCR, ARS...)",
  "date_document": "date au format YYYY-MM-DD si trouvée, sinon vide",
  "objet": "résumé en une phrase de l'objet du document",
  "separateur_blocs": "regex Python pour séparer les dossiers individuels (ex: OGC \\\\d+ :)",
  "colonnes_detectees": ["liste des champs/colonnes détectés dans la structure"]
}}

IMPORTANT :
- Le separateur_blocs doit être un regex Python valide
- Il doit capturer le motif qui sépare chaque dossier/cas individuel
- Si c'est un document UCR, le séparateur est typiquement "OGC \\\\d+ :"
- Si tu ne trouves pas de séparateur clair, mets une chaîne vide ""
"""


def detect_document_type(full_text: str, model: str, timeout: int, verbose: bool) -> dict:
    """Phase 1: detect the document type via the LLM."""
    preview = full_text[:3000]
    prompt = PROMPT_PHASE1.format(text_preview=preview)
    result = call_ollama(prompt, model=model, timeout=timeout, verbose=verbose)
    if result is None:
        print("  [warn] Phase 1 : détection échouée, utilisation des valeurs par défaut")
        return {
            "type_document": "autre",
            "organisme": "",
            "date_document": "",
            "objet": "",
            "separateur_blocs": "",
            "colonnes_detectees": [],
        }
    return result
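
# Shape of a successful phase-1 result (illustrative values, mirroring the
# fields requested in PROMPT_PHASE1):
#     {"type_document": "decision_ucr", "organisme": "UCR", "date_document": "",
#      "objet": "…", "separateur_blocs": "OGC \\d+ :", "colonnes_detectees": [...]}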


# ---------------------------------------------------------------------------
# 4. Block splitting
# ---------------------------------------------------------------------------

def split_into_blocks(full_text: str, separator_pattern: str) -> list[str]:
    """Split the text into logical blocks (individual case files)."""
    blocks = []

    # First try the separator detected by the LLM
    if separator_pattern:
        try:
            regex = re.compile(separator_pattern, re.MULTILINE | re.IGNORECASE)
            # Each separator match opens a new block (the separator belongs to the block that follows)
            matches = list(regex.finditer(full_text))
            if len(matches) >= 3:
                for i, match in enumerate(matches):
                    start = match.start()
                    end = matches[i + 1].start() if i + 1 < len(matches) else len(full_text)
                    block = full_text[start:end].strip()
                    if block:
                        blocks.append(block)
                print(f"  Découpage par séparateur : {len(blocks)} blocs trouvés")
                return blocks
            else:
                print(f"  [warn] Séparateur '{separator_pattern}' → seulement {len(matches)} blocs, fallback")
        except re.error as e:
            print(f"  [warn] Regex invalide '{separator_pattern}' : {e}, fallback")

    # Fallback: size-based chunking (~6000 chars, 500-char overlap)
    chunk_size = 6000
    overlap = 500
    text_len = len(full_text)
    if text_len <= chunk_size:
        return [full_text]

    pos = 0
    while pos < text_len:
        end = min(pos + chunk_size, text_len)
        # Try to cut at an end of line
        if end < text_len:
            newline_pos = full_text.rfind("\n", pos + chunk_size - 200, end + 200)
            if newline_pos > pos:
                end = newline_pos
        blocks.append(full_text[pos:end].strip())
        pos = end - overlap if end < text_len else text_len

    print(f"  Découpage par taille : {len(blocks)} blocs ({chunk_size} chars, chevauchement {overlap})")
    return blocks
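
# Illustrative split, assuming the LLM proposed the separator "OGC \\d+ :"
# (at least 3 matches are required before the separator path is trusted):
#     >>> txt = "OGC 1 : premier dossier\nOGC 2 : deuxième\nOGC 3 : troisième"
#     >>> split_into_blocks(txt, r"OGC \d+ :")   # also prints a progress line
#     ['OGC 1 : premier dossier', 'OGC 2 : deuxième', 'OGC 3 : troisième']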


# ---------------------------------------------------------------------------
# 5. Phase 2 — Block-by-block extraction
# ---------------------------------------------------------------------------

SCHEMA_FIELDS = """\
Champs à extraire (JSON) — remplis chaque champ ou laisse une chaîne vide "" si non trouvé :
- "champ": numéro de champ (entier, 0 si non trouvé)
- "ogc": numéro OGC / numéro de dossier (entier, 0 si non trouvé)
- "type_desaccord": type de désaccord — "DP", "DAS", "DP + DAS", ou ""
- "code_etablissement": code(s) CIM-10 de l'établissement (ex: "G40.0 + F10.2")
- "libelle_etablissement": libellé(s) correspondant aux codes établissement
- "code_controleurs": code(s) CIM-10 des contrôleurs (ou "non repris")
- "libelle_controleurs": libellé(s) correspondant aux codes contrôleurs
- "codes_retenus_final": code(s) finalement retenus par l'UCR/la décision
- "decision": classification — "Favorable établissement", "Défavorable établissement", "Mixte", ou "Indéterminé"
  * "Favorable établissement" = la décision retient l'avis/le codage de l'établissement
  * "Défavorable établissement" = la décision confirme l'avis des contrôleurs
  * "Mixte" = partiellement favorable et partiellement défavorable
  * "Indéterminé" = impossible à classifier clairement
- "texte_decision_complet": texte intégral de la décision/conclusion
- "resume_motif": résumé en 1-2 phrases du motif de la décision
- "regles_citees": règles de codage citées (ex: "T3, T7")
- "references_guide": références documentaires (guide méthodologique, fascicules ATIH, avis Agora…)
- "ghm_mentionne": tous les GHM mentionnés (ex: "05M09 / 05M092")
- "ghs_mentionne": tous les GHS mentionnés
- "ghm_final": le GHM final retenu
- "ghs_final": le GHS final retenu
- "impact_groupage": impact sur le groupage — "Mieux valorisé", "Pas de changement", ou ""
"""

PROMPT_PHASE2 = """\
Tu es un expert en codage PMSI et contrôle T2A.

CONTEXTE DOCUMENT :
- Type : {type_document}
- Organisme : {organisme}
- Objet : {objet}

BLOC DE TEXTE À ANALYSER :
---
{block_text}
---

CONSIGNES :
1. Extrais les informations de chaque dossier/cas présent dans ce bloc.
2. Si le bloc contient UN SEUL dossier, retourne un objet JSON.
3. Si le bloc contient PLUSIEURS dossiers, retourne une LISTE d'objets JSON.
4. Si le bloc ne contient aucun dossier exploitable (en-tête, pied de page, texte administratif sans cas individuel), retourne : {{"skip": true}}

{schema}

IMPORTANT :
- Sois précis sur les codes CIM-10 (format X00.0)
- Pour la décision, analyse attentivement le texte : "retient l'avis de l'établissement" = Favorable, "confirme l'avis des contrôleurs" = Défavorable
- Ne laisse aucun champ sans clé, utilise "" pour les valeurs inconnues
- Retourne UNIQUEMENT du JSON valide, sans texte avant ou après
"""


def extract_block(
    block_text: str,
    doc_info: dict,
    model: str,
    timeout: int,
    verbose: bool,
) -> list[dict]:
    """Extract the data of one block via the LLM. Returns a list of case files."""
    prompt = PROMPT_PHASE2.format(
        type_document=doc_info.get("type_document", "autre"),
        organisme=doc_info.get("organisme", ""),
        objet=doc_info.get("objet", ""),
        block_text=block_text[:8000],  # cap the block size
        schema=SCHEMA_FIELDS,
    )
    result = call_ollama(prompt, model=model, max_tokens=4000, timeout=timeout, verbose=verbose)
    if result is None:
        return []

    # Explicit skip marker
    if isinstance(result, dict) and result.get("skip"):
        return []

    # Normalise to a list
    if isinstance(result, dict):
        items = [result]
    elif isinstance(result, list):
        items = [r for r in result if isinstance(r, dict) and not r.get("skip")]
    else:
        return []

    return items


# ---------------------------------------------------------------------------
# 6. Merge and deduplication
# ---------------------------------------------------------------------------

# Mapping of LLM keys (snake_case) → Excel keys (TitleCase)
KEY_MAP = {
    "champ": "Champ",
    "ogc": "OGC",
    "type_desaccord": "Type_desaccord",
    "code_etablissement": "Code_etablissement",
    "libelle_etablissement": "Libelle_etablissement",
    "code_controleurs": "Code_controleurs",
    "libelle_controleurs": "Libelle_controleurs",
    "codes_retenus_final": "Codes_retenus_final",
    "decision": "Decision",
    "texte_decision_complet": "Texte_decision_complet",
    "resume_motif": "Resume_motif",
    "regles_citees": "Regles_citees",
    "references_guide": "References_guide",
    "ghm_mentionne": "GHM_mentionne",
    "ghs_mentionne": "GHS_mentionne",
    "ghm_final": "GHM_final",
    "ghs_final": "GHS_final",
    "impact_groupage": "Impact_groupage",
}


def normalize_row(raw: dict) -> dict:
    """Convert LLM keys into Excel keys and normalise the value types."""
    row = {}
    for llm_key, excel_key in KEY_MAP.items():
        val = raw.get(llm_key, raw.get(excel_key, ""))
        # Champ and OGC are coerced to int
        if excel_key in ("Champ", "OGC"):
            try:
                val = int(val) if val else 0
            except (ValueError, TypeError):
                val = 0
        elif not isinstance(val, str):
            val = str(val) if val is not None else ""
        row[excel_key] = val
    return row
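
# Illustrative mapping (doctest-style sketch; note the int coercion):
#     >>> normalize_row({"ogc": "12", "decision": "Mixte"})["OGC"]
#     12
#     >>> normalize_row({"champ": None})["Champ"]
#     0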


def merge_and_deduplicate(all_items: list[dict]) -> list[dict]:
    """Merge, deduplicate by OGC, and sort the results."""
    rows = [normalize_row(item) for item in all_items]

    # Drop rows with no useful content
    rows = [r for r in rows if r["OGC"] > 0 or r["Code_etablissement"] or r["Decision"]]

    # Deduplicate by OGC (keep the most complete version)
    seen: dict[int, dict] = {}
    deduped: list[dict] = []
    for r in rows:
        key = r["OGC"]
        if key == 0:
            deduped.append(r)
            continue
        if key in seen:
            old = seen[key]
            old_score = sum(1 for v in old.values() if v and v != 0)
            new_score = sum(1 for v in r.values() if v and v != 0)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)

    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped
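
# Deduplication keeps the richer of two rows sharing an OGC (sketch):
#     >>> a = normalize_row({"ogc": 7, "decision": "Mixte"})
#     >>> b = normalize_row({"ogc": 7, "decision": "Mixte", "ghm_final": "05M092"})
#     >>> [r["GHM_final"] for r in merge_and_deduplicate([a, b])]
#     ['05M092']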


# ---------------------------------------------------------------------------
# 7. Excel export
# ---------------------------------------------------------------------------

HEADERS = [
    "Champ", "OGC", "Type_desaccord",
    "Code_etablissement", "Libelle_etablissement",
    "Code_controleurs", "Libelle_controleurs",
    "Codes_retenus_final",
    "Decision", "Texte_decision_complet", "Resume_motif",
    "Regles_citees", "References_guide",
    "GHM_mentionne", "GHS_mentionne", "GHM_final", "GHS_final",
    "Impact_groupage",
]

HEADER_LABELS = [
    "Champ", "N° OGC", "Type désaccord",
    "Code(s) Établissement", "Libellé Établissement",
    "Code(s) Contrôleurs", "Libellé Contrôleurs",
    "Code(s) retenus (final)",
    "Décision UCR", "Texte décision complet", "Résumé du motif",
    "Règles codage citées", "Références (guide, fascicules, avis)",
    "GHM mentionné(s)", "GHS mentionné(s)", "GHM final", "GHS final",
    "Impact groupage",
]


def write_excel(rows: list[dict], output_path: str):
    """Write the results to an Excel file (single sheet)."""
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Styles
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"), right=Side(style="thin"),
        top=Side(style="thin"), bottom=Side(style="thin"),
    )

    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            val = data.get(key, "")
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.border = thin_border
            cell.alignment = Alignment(vertical="top", wrap_text=True)

        # Colour the Decision column
        dec_col = HEADERS.index("Decision") + 1
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22,
        "Decision": 24, "Texte_decision_complet": 80,
        "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10,
        "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter + frozen header row
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"
    ws.freeze_panes = "A2"

    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")


# ---------------------------------------------------------------------------
# 8. CLI / Main
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="Extracteur T2A généraliste via OCR + LLM (Ollama)",
    )
    parser.add_argument("pdf", help="Fichier PDF à traiter")
    parser.add_argument("--model", default="gemma3:27b-it-qat",
                        help="Modèle Ollama (défaut: gemma3:27b-it-qat)")
    parser.add_argument("--timeout", type=int, default=120,
                        help="Timeout par appel LLM en secondes (défaut: 120)")
    parser.add_argument("--output", default=None,
                        help="Fichier Excel de sortie (défaut: <nom>_llm.xlsx)")
    parser.add_argument("--dpi", type=int, default=300,
                        help="Résolution OCR (défaut: 300)")
    parser.add_argument("--no-cache", action="store_true",
                        help="Désactiver le cache texte OCR")
    parser.add_argument("--verbose", action="store_true",
                        help="Afficher les prompts/réponses LLM")

    args = parser.parse_args()

    pdf_path = args.pdf
    if not Path(pdf_path).exists():
        print(f"[ERREUR] Fichier non trouvé : {pdf_path}")
        sys.exit(1)

    output_path = args.output or str(Path(pdf_path).with_name(
        Path(pdf_path).stem + "_llm.xlsx"
    ))

    print(f"Fichier PDF  : {pdf_path}")
    print(f"Modèle LLM   : {args.model}")
    print(f"Sortie Excel : {output_path}")
    print()

    # --- Step 1: OCR ---
    txt_cache = Path(pdf_path).with_suffix(".txt")
    if txt_cache.exists() and not args.no_cache:
        print("Étape 1/4 : Chargement du texte depuis le cache...")
        full_text = txt_cache.read_text(encoding="utf-8")
        full_text = normalize_text(full_text)
        print(f"  {len(full_text)} caractères chargés depuis {txt_cache}")
    else:
        print("Étape 1/4 : OCR du document...")
        full_text = ocr_pdf(pdf_path, dpi=args.dpi)
        if not args.no_cache:
            txt_cache.write_text(full_text, encoding="utf-8")
            print(f"  Cache texte sauvegardé : {txt_cache}")
        print(f"  Longueur du texte : {len(full_text)} caractères")
    print()

    # --- Step 2: document type detection ---
    print("Étape 2/4 : Détection du type de document...")
    t0 = time.time()
    doc_info = detect_document_type(full_text, model=args.model, timeout=args.timeout, verbose=args.verbose)
    print(f"  Type      : {doc_info.get('type_document', '?')}")
    print(f"  Organisme : {doc_info.get('organisme', '?')}")
    print(f"  Objet     : {doc_info.get('objet', '?')}")
    print(f"  Séparateur: {doc_info.get('separateur_blocs', '(aucun)')}")
    print(f"  Colonnes  : {doc_info.get('colonnes_detectees', [])}")
    print(f"  ({time.time() - t0:.1f}s)")
    print()

    # --- Step 3: splitting and extraction ---
    print("Étape 3/4 : Découpage en blocs et extraction LLM...")
    separator = doc_info.get("separateur_blocs", "")
    blocks = split_into_blocks(full_text, separator)
    print(f"  {len(blocks)} blocs à traiter")

    all_items = []
    t0 = time.time()
    for i, block in enumerate(blocks):
        print(f"  Bloc {i+1}/{len(blocks)}...", end="\r")
        items = extract_block(block, doc_info, model=args.model, timeout=args.timeout, verbose=args.verbose)
        all_items.extend(items)
        # Progress estimate
        elapsed = time.time() - t0
        avg = elapsed / (i + 1)
        remaining = avg * (len(blocks) - i - 1)
        print(f"  Bloc {i+1}/{len(blocks)} → {len(items)} dossier(s) "
              f"[{elapsed:.0f}s écoulé, ~{remaining:.0f}s restant]   ")

    total_elapsed = time.time() - t0
    print(f"  Extraction terminée : {len(all_items)} dossiers bruts en {total_elapsed:.0f}s")
    print()

    # --- Step 4: merge and export ---
    print("Étape 4/4 : Fusion, dédoublonnage et export Excel...")
    rows = merge_and_deduplicate(all_items)
    print(f"  {len(rows)} dossiers après dédoublonnage")

    # Statistics
    fav = sum(1 for r in rows if "Favorable" in r.get("Decision", "") and "Défavorable" not in r.get("Decision", ""))
    defav = sum(1 for r in rows if "Défavorable" in r.get("Decision", ""))
    mixte = sum(1 for r in rows if "Mixte" in r.get("Decision", ""))
    indet = sum(1 for r in rows if r.get("Decision", "") in ("Indéterminé", ""))
    print(f"  Favorable établissement   : {fav}")
    print(f"  Défavorable établissement : {defav}")
    print(f"  Mixte                     : {mixte}")
    print(f"  Indéterminé               : {indet}")

    write_excel(rows, output_path)
    print()
    print("Terminé.")


if __name__ == "__main__":
    main()
690
cpam/parse_decision_ucr.py
Normal file
@@ -0,0 +1,690 @@
#!/usr/bin/env python3
"""
parse_decision_ucr.py — Extraction of UCR decisions from a scanned PDF (T2A audit)

Input  : scanned PDF of a UCR decision (CPAM / Assurance Maladie)
Output : Excel file (.xlsx) with a single sheet

Extracted columns (enriched for AI analysis):
    Champ, OGC, Type_desaccord,
    Code_etablissement, Libelle_etablissement,
    Code_controleurs, Libelle_controleurs,
    Codes_retenus_final,
    Decision, Texte_decision_complet, Resume_motif,
    Regles_citees, References_guide,
    GHM_mentionne, GHS_mentionne, GHM_final, GHS_final,
    Impact_groupage
"""
from __future__ import annotations

import io
import re
import sys
from pathlib import Path

import pymupdf
import pytesseract
from PIL import Image
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side


# ---------------------------------------------------------------------------
# 0. OCR text normalisation
# ---------------------------------------------------------------------------

def normalize_text(text: str) -> str:
    """Normalise the apostrophes, quotation marks and spaces produced by OCR."""
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201C", '"').replace("\u201D", '"')
    text = text.replace("\u00AB", '"').replace("\u00BB", '"')
    text = text.replace("''", "'")
    text = text.replace("\u00A0", " ").replace("\u202F", " ")
    # Common OCR misreads
    text = re.sub(r"\bF'UCR\b", "l'UCR", text)
    text = re.sub(r"\bl''UCR\b", "l'UCR", text)
    return text


# ---------------------------------------------------------------------------
# 1. OCR
# ---------------------------------------------------------------------------

def ocr_pdf(pdf_path: str, dpi: int = 300) -> str:
    """Extract the text of every page of the PDF via Tesseract OCR."""
    doc = pymupdf.open(pdf_path)
    full_text = []
    total = len(doc)
    for i, page in enumerate(doc):
        print(f"  OCR page {i+1}/{total}...", end="\r")
        mat = pymupdf.Matrix(dpi / 72, dpi / 72)  # rasterise at `dpi` instead of the 72-dpi default
        pix = page.get_pixmap(matrix=mat)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        text = pytesseract.image_to_string(img, lang="fra")
        full_text.append(text)
    print(f"  OCR terminé : {total} pages.          ")
    return normalize_text("\n\n".join(full_text))


# ---------------------------------------------------------------------------
# 2. Parsing — Regex
# ---------------------------------------------------------------------------

RE_CHAMP = re.compile(
    r"Champ\s*(?:n°\s*)?(\d+)\s*[:\-—]?\s*(?:Séjours|:)",
    re.IGNORECASE,
)

RE_OGC_HEADER = re.compile(
    r"(?:^|\n)\s*OGC\s+(\d+)\s*:",
    re.MULTILINE,
)

RE_TYPE_DESACCORD = re.compile(
    r"(?:désaccord|discussion)\s+porte\s+(?:sur\s+)?(?:le\s+|les\s+)?(DP\s+et\s+(?:le\s+)?DAS|DP\s+et\s+DAS|DP|DAS)",
    re.IGNORECASE,
)

RE_CIM10 = re.compile(r"\b([A-Z]\d{2}(?:\.\d{1,2})?)\b")
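
# Illustrative matches (doctest-style sketch; CIM-10 codes look like "G40.0"):
#     >>> RE_CIM10.findall("DP G40.0 retenu, DAS F10.2 ajouté")
#     ['G40.0', 'F10.2']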

RE_CODAGE_ETS = re.compile(
    r"Codage\s+[ée]tablissement\s*:\s*(.*?)(?=Codage\s+contr[ôo]leurs)",
    re.IGNORECASE | re.DOTALL,
)

RE_CODAGE_CTRL = re.compile(
    r"Codage\s+contr[ôo]leurs\s*:\s*(.*?)(?=D[EÉ]C[I1]?SION\s+UCR|PROPOSITION\s+UCR)",
    re.IGNORECASE | re.DOTALL,
)

RE_DECISION = re.compile(
    r"(?:D[EÉ]C[I1]?SION|PROPOSITION)\s+UCR\s*:?\s*(.*)",
    re.IGNORECASE | re.DOTALL,
)

# --- Classification ---

RE_FAVORABLE = re.compile(
    r"(?:"
    r"retient\s+(?:la\s+demande|le\s+codage|l'avis)\s+(?:de\s+)?l'[ée]tablissement"
    r"|retient\s+en\s+D[PA]S\s+le\s+code"
    r"|retient\s+le\s+codage\s+du\s+DP\s+de\s+l'[ée]tablissement"
    r"|l'UCR\s+retient\s+l'avis\s+de\s+l'[ée]tablissement"
    r"|confirme\s+l'avis\s+(?:de\s+)?l'[ée]tablissement"
    r")",
    re.IGNORECASE,
)

RE_DEFAVORABLE = re.compile(
    r"confirme\s+l'avis\s+des\s+(?:m[ée]decins\s+)?contr[oô]leurs",
    re.IGNORECASE,
)

RE_UCR_RETIENT = re.compile(r"l'UCR\s+retient\b", re.IGNORECASE)
RE_UCR_PROPOSE = re.compile(r"l'UCR\s+propose\b", re.IGNORECASE)
RE_NE_RETIENT_PAS = re.compile(r"ne\s+retient\s+pas", re.IGNORECASE)

# --- GHM / GHS ---

RE_GHM = re.compile(r"GHM\s+([A-Z0-9]{5,7})", re.IGNORECASE)
RE_GHS = re.compile(r"GHS\s+(\d{3,5})", re.IGNORECASE)

RE_MIEUX_VALORISE = re.compile(r"mieux\s+valoris[ée]", re.IGNORECASE)
RE_PAS_MODIFIE = re.compile(
    r"(?:ne\s+modifie\s+pas|ne\s+change(?:nt)?\s+pas|pas\s+de\s+changement|reste\s+group[ée])",
    re.IGNORECASE,
)

# --- Rules and references ---

# Pages of the methodology guide ("guide méthodologique")
RE_GUIDE_PAGE = re.compile(
    r"(?:guide\s+m[ée]thodologique|guide)\s*(?:p\.?|page)\s*(\d{1,3})",
    re.IGNORECASE,
)
RE_PAGE_GUIDE = re.compile(
    r"(?:p\.?|page)\s*(\d{1,3})\s+du\s+guide",
    re.IGNORECASE,
)

# T rules (T3, T7, etc.)
RE_REGLE_T = re.compile(
    r"r[èe]gle\s+(T\d+)",
    re.IGNORECASE,
)

# ATIH fascicles
RE_FASCICULE = re.compile(
    r"fascicule\s+(?:ATIH\s+)?(?:de\s+codage\s+)?(?:PMSI\s+)?(?:n°\s*)?(\d{1,2})?\s*(?:[-–]\s*)?([A-ZÀ-Üa-zà-ü\s]+?)(?:\s+(?:de\s+)?(\d{4}))?(?:\s*(?:,\s*)?(?:p\.?\s*|page\s*)(\d+))?",
    re.IGNORECASE,
)

# Agora opinions
RE_AVIS_AGORA = re.compile(
    r"avis\s+(?:agora|AGORA)\s*(?:n°\s*)?(\d+)",
    re.IGNORECASE,
)

# Coding instructions with a page number
RE_CONSIGNES_CODAGE = re.compile(
    r"consignes?\s+de\s+codage\s*(?:p\.?\s*|page\s*)(\d+)",
    re.IGNORECASE,
)

# Retained coding / retained DP / retained DAS
RE_CODAGE_RETENU = re.compile(
    r"(?:codage\s+retenu|DP\s*(?:retenu|=)|DAS\s*(?:retenu|=)|code\s+retenu|est\s+cod[ée]\s+en|se\s+code)\s*(?:est\s+)?(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)

# "est ajouté en DAS" / "ajout du code X"
RE_CODE_AJOUTE = re.compile(
    r"(?:est\s+ajout[ée]\s+en\s+D[PA]S|ajout(?:er)?\s+(?:du\s+|en\s+D[PA]S\s+(?:le\s+)?)?(?:code\s+)?)\s*(?::?\s*)([A-Z]\d{2}(?:\.\d{1,2})?)",
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# 2b. Extraction helpers
# ---------------------------------------------------------------------------

def extract_codes_and_label(text: str) -> tuple[str, str]:
    """Extract the CIM-10 codes and the label from a coding block."""
    codes = RE_CIM10.findall(text)
    labels = re.findall(r'[«"](.*?)[»"]', text)
    code_str = " + ".join(codes) if codes else ""
    label_str = " | ".join(labels) if labels else text.strip()[:120]
    label_str = re.sub(r"\s+", " ", label_str).strip()
    return code_str, label_str
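
# Sketch of the expected behaviour (labels are read from the quotation marks):
#     >>> extract_codes_and_label('G40.0 « Épilepsie » + F10.2 « Alcoolisme »')
#     ('G40.0 + F10.2', 'Épilepsie | Alcoolisme')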


def extract_codes_retenus(decision_text: str) -> str:
    """Extract the codes finally retained by the UCR."""
    codes = set()
    for m in RE_CODAGE_RETENU.finditer(decision_text):
        codes.add(m.group(1))
    for m in RE_CODE_AJOUTE.finditer(decision_text):
        codes.add(m.group(1))
    return " + ".join(sorted(codes)) if codes else ""


def extract_regles(text: str) -> str:
    """Extract the cited coding rules (T3, T7, etc.)."""
    regles = set()
    for m in RE_REGLE_T.finditer(text):
        regles.add(m.group(1).upper())
    return ", ".join(sorted(regles)) if regles else ""


def extract_references(text: str) -> str:
    """Extract every reference (guide, fascicles, Agora opinions, coding instructions)."""
    refs = []

    # Pages of the methodology guide
    pages_guide = set()
    for m in RE_GUIDE_PAGE.finditer(text):
        pages_guide.add(m.group(1))
    for m in RE_PAGE_GUIDE.finditer(text):
        pages_guide.add(m.group(1))
    if pages_guide:
        refs.append("Guide méthodologique p." + ", p.".join(sorted(pages_guide, key=int)))

    # ATIH fascicles
    for m in RE_FASCICULE.finditer(text):
        num = m.group(1) or ""
        sujet = (m.group(2) or "").strip()
        annee = m.group(3) or ""
        page = m.group(4) or ""
        ref = "Fascicule"
        if num:
            ref += f" {num}"
        if sujet:
            ref += f" {sujet}"
        if annee:
            ref += f" ({annee})"
        if page:
            ref += f" p.{page}"
        refs.append(ref.strip())

    # Agora opinions
    for m in RE_AVIS_AGORA.finditer(text):
        refs.append(f"Avis Agora n°{m.group(1)}")

    # Coding instructions
    for m in RE_CONSIGNES_CODAGE.finditer(text):
        refs.append(f"Consignes de codage p.{m.group(1)}")

    # Deduplicate (case-insensitively), keeping the first occurrence
    seen = set()
    unique = []
    for r in refs:
        r_lower = r.lower()
        if r_lower not in seen:
            seen.add(r_lower)
            unique.append(r)

    return " ; ".join(unique) if unique else ""


def extract_ghm_ghs_all(text: str) -> tuple[list[str], list[str]]:
    """Extract every GHM and GHS mentioned, in order of appearance."""
    ghms = []
    for m in RE_GHM.finditer(text):
        v = m.group(1).upper()
        if v not in ghms:
            ghms.append(v)
    ghss = []
    for m in RE_GHS.finditer(text):
        v = m.group(1)
        if v not in ghss:
            ghss.append(v)
    return ghms, ghss


def classify_decision(decision_text: str) -> str:
    """Classify the decision: Favorable / Défavorable / Mixte / Indéterminé."""
    text = normalize_text(decision_text)

    fav = bool(RE_FAVORABLE.search(text))
    defav = bool(RE_DEFAVORABLE.search(text))

    ucr_retient = bool(RE_UCR_RETIENT.search(text))
    ucr_propose = bool(RE_UCR_PROPOSE.search(text))
    ne_retient_pas = bool(RE_NE_RETIENT_PAS.search(text))

    if ucr_retient and not ne_retient_pas:
        fav = True
    if ucr_propose and not defav:
        fav = True

    if (ucr_retient or fav) and defav:
        return "Mixte"
    if fav:
        return "Favorable établissement"
    if defav:
        return "Défavorable établissement"
    return "Indéterminé"


def clean_decision_text(text: str) -> str:
    """Clean the decision text (strips OCR artifacts at the end of the block)."""
    # Strip UCR footer lines
    text = re.sub(r"\n\s*(?:UCR\s+NA|CONFIDENTIEL|Page\s+\d+).*$", "", text, flags=re.MULTILINE | re.IGNORECASE)
    # Strip trailing OCR artifacts (runs of isolated characters)
    text = re.sub(r"\n\s*[A-Z]{1,4}\s*(?:—|–|-)\s*[a-zA-Z]{0,3}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n\s*(?:EE|ESS|2 ae|A D ES|EE nd)\s*$", "", text, flags=re.MULTILINE | re.IGNORECASE)
    # Normalise whitespace
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ---------------------------------------------------------------------------
# 2c. Block parsing
# ---------------------------------------------------------------------------

def parse_ogc_block(block_text: str, champ: int, ogc_num: int) -> dict:
    """Parse one OGC block and return an enriched structured dictionary."""
    result = {
        "Champ": champ,
        "OGC": ogc_num,
        "Type_desaccord": "",
        "Code_etablissement": "",
        "Libelle_etablissement": "",
        "Code_controleurs": "",
        "Libelle_controleurs": "",
        "Codes_retenus_final": "",
        "Decision": "",
        "Texte_decision_complet": "",
        "Resume_motif": "",
        "Regles_citees": "",
        "References_guide": "",
        "GHM_mentionne": "",
        "GHS_mentionne": "",
        "GHM_final": "",
        "GHS_final": "",
        "Impact_groupage": "",
    }

    # Disagreement type
    m = RE_TYPE_DESACCORD.search(block_text)
    if m:
        raw = m.group(1).upper().strip()
        raw = re.sub(r"\s+", " ", raw)
        if "DP" in raw and "DAS" in raw:
            result["Type_desaccord"] = "DP + DAS"
        elif "DAS" in raw:
            result["Type_desaccord"] = "DAS"
        elif "DP" in raw:
            result["Type_desaccord"] = "DP"

    # Establishment coding
    m = RE_CODAGE_ETS.search(block_text)
    if m:
        raw_ets = m.group(1).strip()
        result["Code_etablissement"], result["Libelle_etablissement"] = extract_codes_and_label(raw_ets)

    # Auditors' coding
    m = RE_CODAGE_CTRL.search(block_text)
    if m:
        raw_ctrl = m.group(1).strip()
        if re.search(r"non\s+repris", raw_ctrl, re.IGNORECASE):
            result["Code_controleurs"] = "non repris"
            result["Libelle_controleurs"] = ""
        else:
            result["Code_controleurs"], result["Libelle_controleurs"] = extract_codes_and_label(raw_ctrl)

    # UCR decision — FULL TEXT
    m = RE_DECISION.search(block_text)
    if m:
        decision_text = m.group(1).strip()
        decision_clean = clean_decision_text(decision_text)

        result["Decision"] = classify_decision(decision_clean)
        result["Texte_decision_complet"] = decision_clean

        # Short summary (first meaningful sentences)
        resume = re.sub(r"\s+", " ", decision_clean)[:300].strip()
        # Cut at the last complete sentence
        last_dot = resume.rfind(".")
        if last_dot > 100:
            resume = resume[:last_dot + 1]
        result["Resume_motif"] = resume

        # Codes finally retained
        result["Codes_retenus_final"] = extract_codes_retenus(decision_clean)

    # Cited rules (T3, T7, etc.)
    result["Regles_citees"] = extract_regles(block_text)

    # References (guide, fascicles, Agora opinions)
    result["References_guide"] = extract_references(block_text)

    # GHM / GHS — everything mentioned; the last one is taken as final
    ghms, ghss = extract_ghm_ghs_all(block_text)
    if ghms:
        result["GHM_mentionne"] = " / ".join(ghms)
        result["GHM_final"] = ghms[-1]  # the last one mentioned is usually the final one
    if ghss:
        result["GHS_mentionne"] = " / ".join(ghss)
        result["GHS_final"] = ghss[-1]

    # Grouping impact
    if RE_MIEUX_VALORISE.search(block_text):
        result["Impact_groupage"] = "Mieux valorisé"
    elif RE_PAS_MODIFIE.search(block_text):
        result["Impact_groupage"] = "Pas de changement"

    return result


def parse_grouped_ogcs(text_block: str, champ: int, ogc_nums: list[int]) -> list[dict]:
    """Parse a grouped block (e.g. OGC 14, 19, 46 and 50 handled together)."""
    template = parse_ogc_block(text_block, champ, ogc_nums[0])
    results = []
    for num in ogc_nums:
        row = dict(template)
        row["OGC"] = num
        results.append(row)
    return results


def parse_document(full_text: str) -> list[dict]:
    """Parse the full OCR text and return the list of case files."""
    rows = []

    champ_positions = [(m.start(), int(m.group(1))) for m in RE_CHAMP.finditer(full_text)]
    ogc_positions = [(m.start(), int(m.group(1))) for m in RE_OGC_HEADER.finditer(full_text)]

    def get_champ_for_position(pos: int) -> int:
        ch = 0
        for cp, cn in champ_positions:
            if cp <= pos:
                ch = cn
            else:
                break
        return ch

    # Grouped blocks (e.g. "Concernant les OGC 14, 19, 46, 50 ...")
    RE_GROUPED = re.compile(
        r"(?:Concernant|Pour)\s+les\s+OGC\s+([\d,\s]+)",
        re.IGNORECASE,
    )

    grouped_ogcs = set()
    for m in RE_GROUPED.finditer(full_text):
        nums = [int(n.strip()) for n in m.group(1).split(",") if n.strip().isdigit()]
        if len(nums) > 1:
            start = m.start()
            end = len(full_text)
            for op, on in ogc_positions:
                if op > start + 50 and on not in nums:
                    end = op
                    break
            block = full_text[start:end]
            champ = get_champ_for_position(start)
            group_rows = parse_grouped_ogcs(block, champ, nums)
            rows.extend(group_rows)
            grouped_ogcs.update(nums)

    # Individual OGCs
    for idx, (pos, ogc_num) in enumerate(ogc_positions):
        champ = get_champ_for_position(pos)

        end = len(full_text)
        for next_pos, _ in ogc_positions[idx + 1:]:
            if next_pos > pos + 20:
                end = next_pos
                break
        for cp, _ in champ_positions:
            if pos < cp < end:
                end = cp
                break

        block = full_text[pos:end]
        row = parse_ogc_block(block, champ, ogc_num)

        if ogc_num in grouped_ogcs:
            # Replace the grouped row only if the individual block is complete
            if row["Code_etablissement"] and row["Decision"]:
                rows = [r for r in rows if r["OGC"] != ogc_num]
                rows.append(row)
        else:
            if row["Code_etablissement"] or row["Decision"]:
                rows.append(row)

    rows.sort(key=lambda r: (r["Champ"], r["OGC"]))

    # Deduplicate by OGC (keep the most complete version)
    seen = {}
    deduped = []
    for r in rows:
        key = r["OGC"]
        if key in seen:
            old = seen[key]
            old_score = sum(1 for v in old.values() if v)
            new_score = sum(1 for v in r.values() if v)
            if new_score > old_score:
                deduped = [x for x in deduped if x["OGC"] != key]
                deduped.append(r)
                seen[key] = r
        else:
            seen[key] = r
            deduped.append(r)

    deduped.sort(key=lambda r: (r["Champ"], r["OGC"]))
    return deduped


# ---------------------------------------------------------------------------
# 3. Excel export
# ---------------------------------------------------------------------------

HEADERS = [
    "Champ",
    "OGC",
    "Type_desaccord",
    "Code_etablissement",
    "Libelle_etablissement",
    "Code_controleurs",
    "Libelle_controleurs",
    "Codes_retenus_final",
    "Decision",
    "Texte_decision_complet",
    "Resume_motif",
    "Regles_citees",
    "References_guide",
    "GHM_mentionne",
    "GHS_mentionne",
    "GHM_final",
    "GHS_final",
    "Impact_groupage",
]

HEADER_LABELS = [
    "Champ",
    "N° OGC",
    "Type désaccord",
    "Code(s) Établissement",
    "Libellé Établissement",
    "Code(s) Contrôleurs",
    "Libellé Contrôleurs",
    "Code(s) retenus (final)",
    "Décision UCR",
    "Texte décision complet",
    "Résumé du motif",
    "Règles codage citées",
    "Références (guide, fascicules, avis)",
    "GHM mentionné(s)",
    "GHS mentionné(s)",
    "GHM final",
    "GHS final",
    "Impact groupage",
]


def write_excel(rows: list[dict], output_path: str):
    """Write the results to an Excel file (single sheet)."""
    wb = Workbook()
    ws = wb.active
    ws.title = "Décisions UCR"

    # Styles
    header_font = Font(bold=True, color="FFFFFF", size=11)
    header_fill = PatternFill(start_color="2F5496", end_color="2F5496", fill_type="solid")
    header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    thin_border = Border(
        left=Side(style="thin"),
        right=Side(style="thin"),
        top=Side(style="thin"),
        bottom=Side(style="thin"),
    )

    fav_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    defav_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    mixte_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")

    # Header row
    for col, label in enumerate(HEADER_LABELS, 1):
        cell = ws.cell(row=1, column=col, value=label)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = header_align
        cell.border = thin_border

    # Data rows
    for row_idx, data in enumerate(rows, 2):
        for col_idx, key in enumerate(HEADERS, 1):
            val = data.get(key, "")
            cell = ws.cell(row=row_idx, column=col_idx, value=val)
            cell.border = thin_border
            cell.alignment = Alignment(vertical="top", wrap_text=True)

        # Colour the Decision column
        dec_col = HEADERS.index("Decision") + 1
        decision_cell = ws.cell(row=row_idx, column=dec_col)
        dv = str(decision_cell.value or "")
        if "Favorable" in dv and "Défavorable" not in dv:
            decision_cell.fill = fav_fill
        elif "Défavorable" in dv:
            decision_cell.fill = defav_fill
        elif "Mixte" in dv:
            decision_cell.fill = mixte_fill

    # Column widths
    col_widths = {
        "Champ": 8, "OGC": 8, "Type_desaccord": 14,
        "Code_etablissement": 22, "Libelle_etablissement": 40,
        "Code_controleurs": 22, "Libelle_controleurs": 40,
        "Codes_retenus_final": 22,
        "Decision": 24, "Texte_decision_complet": 80,
        "Resume_motif": 60,
        "Regles_citees": 16, "References_guide": 50,
        "GHM_mentionne": 16, "GHS_mentionne": 16,
        "GHM_final": 12, "GHS_final": 10,
        "Impact_groupage": 20,
    }
    for i, key in enumerate(HEADERS, 1):
        ws.column_dimensions[ws.cell(row=1, column=i).column_letter].width = col_widths.get(key, 15)

    # Auto-filter
    last_col_letter = ws.cell(row=1, column=len(HEADERS)).column_letter
    ws.auto_filter.ref = f"A1:{last_col_letter}{len(rows)+1}"

    # Freeze the header row
    ws.freeze_panes = "A2"

    wb.save(output_path)
    print(f"Excel enregistré : {output_path}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    if len(sys.argv) < 2:
        pdf_path = str(Path(__file__).parent / "SPHO-FINANC26020915121.pdf")
    else:
        pdf_path = sys.argv[1]

    output_path = str(Path(pdf_path).with_suffix(".xlsx"))

    print(f"Fichier PDF : {pdf_path}")
    print("Étape 1/3 : OCR du document...")
    full_text = ocr_pdf(pdf_path)

    txt_path = str(Path(pdf_path).with_suffix(".txt"))
    Path(txt_path).write_text(full_text, encoding="utf-8")
    print(f"  Texte brut sauvegardé : {txt_path}")

    print("Étape 2/3 : Extraction des décisions...")
    rows = parse_document(full_text)
    print(f"  {len(rows)} dossiers OGC extraits.")

    fav = sum(1 for r in rows if "Favorable" in r.get("Decision", "") and "Défavorable" not in r.get("Decision", ""))
    defav = sum(1 for r in rows if "Défavorable" in r.get("Decision", ""))
    mixte = sum(1 for r in rows if "Mixte" in r.get("Decision", ""))
    indet = sum(1 for r in rows if r.get("Decision", "") in ("Indéterminé", ""))
    refs_count = sum(1 for r in rows if r.get("References_guide"))
    codes_ret = sum(1 for r in rows if r.get("Codes_retenus_final"))
    regles = sum(1 for r in rows if r.get("Regles_citees"))

    print(f"  Favorable établissement   : {fav}")
    print(f"  Défavorable établissement : {defav}")
    print(f"  Mixte                     : {mixte}")
    print(f"  Indéterminé               : {indet}")
    print(f"  Avec références citées    : {refs_count}")
    print(f"  Avec codes retenus        : {codes_ret}")
    print(f"  Avec règles T             : {regles}")

    print("Étape 3/3 : Génération du fichier Excel...")
    write_excel(rows, output_path)
    print("Terminé.")


if __name__ == "__main__":
    main()