- Reorganized data/referentiels/ into pdfs/, dicts/, user/ (unified structure)
- Fixed "Source absente" badges on the referentiels admin page
- Re-indexed COCOA 2025 (555 → 1451 chunks, 94% coverage)
- Fixed VRAM OOM: embeddings forced onto the CPU via T2A_EMBED_CPU
- New modules: document_router, docx_extractor, image_extractor, ocr_engine
- Completeness module (quality/completude.py + YAML config)
- DIM template (dimensional summary)
- Gunicorn config + t2a-viewer systemd service
- Removed t2a_install_rag_cleanup/ (obsolete copy)
- Removed scripts/ and scripts_t2a_v2/ (old benchmarks)
- Removed 81 test _doc.txt files
- Ollama cache: configurable TTL, YAML loader fixes
- Dashboard: template improvements (base, index, detail, cpam, validation)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""CLI + orchestrateur du pipeline d'anonymisation et extraction CIM-10."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from .anonymization.anonymizer import Anonymizer
|
|
from .config import (
|
|
ANONYMIZED_DIR,
|
|
INPUT_DIR,
|
|
OUTPUT_DIR,
|
|
REPORTS_DIR,
|
|
STRUCTURED_DIR,
|
|
AnonymizationReport,
|
|
DossierMedical,
|
|
DossierMetrics,
|
|
VetoReport,
|
|
set_rules_runtime,
|
|
reset_rules_runtime,
|
|
)
|
|
from .extraction.document_classifier import classify
|
|
from .extraction.crh_parser import parse_crh
|
|
from .extraction.document_splitter import split_documents
|
|
from .extraction.document_router import SUPPORTED_EXTENSIONS, extract_document_with_pages
|
|
from .extraction.pdf_extractor import extract_text, extract_text_with_pages
|
|
from .extraction.trackare_parser import parse_trackare
|
|
from .medical.cim10_extractor import extract_medical_info
|
|
from .medical.ghm import estimate_ghm
|
|
from .quality.veto_engine import apply_vetos
|
|
from .quality.decision_engine import apply_decisions, decision_summaries
|
|
from .quality.completude import build_completude_checklist
|
|
from .quality.rules_router import build_rules_runtime_context
|
|
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
|
)
|
|
logger = logging.getLogger(__name__)
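
# Typical invocations (illustrative; every flag used here is defined in main() below,
# and the "dossier_X" / "doc.pdf" paths are hypothetical):
#   python3 -m src.main input/                     # process each patient folder under input/
#   python3 -m src.main dossier_X/ --export-rum    # also emit RUM V016 files
#   python3 -m src.main --rebuild-index            # force a FAISS index rebuild
#   python3 -m src.main doc.pdf --no-rag --no-edsnlp --workers 1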


def _compute_metrics(dossier: DossierMedical) -> DossierMetrics:
    """Compute the "active vs excluded" metrics for reporting.

    Professional rule: the metrics (GHM, severity, stats) must only count
    *active* diagnoses, not those excluded by decisions.
    """

    def _has_any_code(diag) -> bool:
        return bool(getattr(diag, "cim10_final", None) or getattr(diag, "cim10_suggestion", None))

    def _is_active_diag(diag) -> bool:
        dec = getattr(diag, "cim10_decision", None)
        if dec is not None and getattr(dec, "action", None) == "REMOVE":
            return False
        if getattr(diag, "cim10_final", None):
            return True
        # Backward compat: no final code, but a suggestion and no decision
        if getattr(diag, "cim10_final", None) is None and getattr(diag, "cim10_suggestion", None) and dec is None:
            return True
        return False

    das_total = len(dossier.diagnostics_associes)
    das_active = 0
    das_removed = 0
    das_no_code = 0
    for d in dossier.diagnostics_associes:
        dec = getattr(d, "cim10_decision", None)
        if dec is not None and getattr(dec, "action", None) == "REMOVE":
            das_removed += 1
        if not _has_any_code(d):
            das_no_code += 1
        if _is_active_diag(d):
            das_active += 1

    actes_total = len(dossier.actes_ccam)
    actes_with_code = sum(1 for a in dossier.actes_ccam if getattr(a, "code_ccam_suggestion", None))

    dp_has_code = False
    if dossier.diagnostic_principal is not None:
        dp = dossier.diagnostic_principal
        dp_dec = getattr(dp, "cim10_decision", None)
        if not (dp_dec is not None and getattr(dp_dec, "action", None) == "REMOVE"):
            dp_has_code = bool(getattr(dp, "cim10_final", None) or getattr(dp, "cim10_suggestion", None))

    metrics = DossierMetrics(
        das_total=das_total,
        das_active=das_active,
        das_excluded=max(0, das_total - das_active),
        das_removed=das_removed,
        das_no_code=das_no_code,
        actes_total=actes_total,
        actes_with_code=actes_with_code,
        dp_has_code=dp_has_code,
    )
    dossier.metrics = metrics
    return metrics
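
# Worked example (illustrative): for five DAS where one carries a REMOVE
# decision and one only has a suggestion with no decision, the call yields
# das_total=5, das_removed=1, das_active=4, das_excluded=1.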


def _inject_veto_alerts(dossier: DossierMedical, veto: VetoReport, scope: str = "FINAL") -> None:
    """Inject the veto-related alerts into alertes_codage while avoiding duplicates.

    The previous VETO section is *replaced* (whether it came from an individual PDF
    or from a merge pass), so that the merged JSON stays readable.
    """
    cleaned: list[str] = []
    for line in (dossier.alertes_codage or []):
        if isinstance(line, str) and (line.startswith("VETOS:") or line.startswith("VETOS[") or line.startswith("VETO-")):
            continue
        cleaned.append(line)
    dossier.alertes_codage = cleaned

    if veto.verdict != "PASS":
        dossier.alertes_codage.append(f"VETOS[{scope}]: {veto.verdict} (score={veto.score_contestabilite})")
        for it in veto.issues[:25]:
            dossier.alertes_codage.append(f"{it.veto} [{it.severity}] {it.where}: {it.message}")


def _inject_decision_alerts(dossier: DossierMedical, scope: str = "FINAL") -> None:
    """Inject the decisions (downgrades/removals) into alertes_codage.

    The previous DECISION section is replaced to keep the JSON readable.
    """
    cleaned: list[str] = []
    for line in (dossier.alertes_codage or []):
        if isinstance(line, str) and line.startswith("DECISION:"):
            continue
        cleaned.append(line)
    dossier.alertes_codage = cleaned

    lines = decision_summaries(dossier)
    if lines:
        dossier.alertes_codage.append(f"DECISIONS[{scope}]: {len(lines)} ligne(s)")
        dossier.alertes_codage.extend(lines[:30])
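
# Shape of the alert sections injected above (placeholders, not real values):
#   VETOS[<scope>]: <verdict> (score=<score_contestabilite>)
#   <veto> [<severity>] <where>: <message>
#   DECISIONS[<scope>]: <n> ligne(s)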


# Global flags
_use_edsnlp = True
_use_rag = True


def process_document(file_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]:
    """Process one document: extraction → splitting → parsing → anonymization → CIM-10 extraction.

    Supports PDF, images (JPEG/PNG/TIFF) and DOCX via the extraction router.

    Returns a list of (anonymized_text, dossier, report) tuples, one per detected dossier.
    """
    t0 = time.time()
    logger.info("Traitement de %s", file_path.name)

    # 1. Text extraction with page tracking (multi-format)
    raw_text, page_tracker, extraction_stats = extract_document_with_pages(file_path)
    logger.info(" Texte extrait : %d caractères (%d pages, format=%s)", len(raw_text), extraction_stats.total_pages, extraction_stats.source_format)

    # 2. Classification
    doc_type = classify(raw_text)
    logger.info(" Type de document : %s", doc_type)

    # 3. Multi-dossier splitting
    chunks = split_documents(raw_text, doc_type)
    if len(chunks) > 1:
        logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), file_path.name)

    results: list[tuple[str, DossierMedical, AnonymizationReport]] = []
    for i, chunk_text in enumerate(chunks):
        part_label = f" [part {i+1}/{len(chunks)}]" if len(chunks) > 1 else ""
        logger.info(" Traitement%s...", part_label)

        # 4. Parsing
        if doc_type == "trackare":
            parsed = parse_trackare(chunk_text)
        else:
            parsed = parse_crh(chunk_text)

        # 5. Anonymization
        anonymizer = Anonymizer(parsed_data=parsed)
        anonymized_text = anonymizer.anonymize(chunk_text)
        report = anonymizer.report
        report.source_file = file_path.name
        logger.info(
            " Anonymisation%s : %d remplacements (regex=%d, ner=%d, sweep=%d)",
            part_label,
            report.total_replacements,
            report.regex_replacements,
            report.ner_replacements,
            report.sweep_replacements,
        )

        # 6. edsnlp analysis (optional)
        edsnlp_result = None
        if _use_edsnlp:
            edsnlp_result = _run_edsnlp(anonymized_text)

        # 7. Medical CIM-10 extraction
        dossier = extract_medical_info(
            parsed, anonymized_text, edsnlp_result, use_rag=_use_rag,
            page_tracker=page_tracker, raw_text=raw_text,
        )
        dossier.source_file = file_path.name
        dossier.document_type = doc_type
        logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)

        # Inject the extraction stats into quality_flags
        extraction_flags = extraction_stats.to_flags()
        if extraction_flags:
            dossier.quality_flags.update(extraction_flags)
        extraction_alert = extraction_stats.to_alert()
        if extraction_alert:
            dossier.alertes_codage.append(extraction_alert)

        # 8. Vetos (contestability) + decisions (post-processing)
        # Rules routing (packs): by default keep the core vetos/decisions set,
        # and enable additional packs based on dossier signals (codes/labs/excerpts).
        rules_token = None
        try:
            rules_ctx = build_rules_runtime_context(dossier)
            dossier.rules_runtime = rules_ctx
            rules_token = set_rules_runtime(rules_ctx)

            packs = ",".join(rules_ctx.get("enabled_packs", []))
            if packs:
                logger.info(" Règles%s : packs=%s", part_label, packs)
            if rules_ctx.get("triggers_fired"):
                logger.info(" Règles%s : triggers=%s", part_label, ",".join(rules_ctx["triggers_fired"]))
        except Exception:
            logger.error(" Routage règles : erreur", exc_info=True)
            dossier.quality_flags["rules_routing"] = "error"

        veto = None
        try:
            veto = apply_vetos(dossier)
            dossier.veto_report = veto
        except Exception:
            logger.error(" Vetos : erreur lors du contrôle", exc_info=True)
            dossier.quality_flags["veto_engine"] = "error"
            dossier.alertes_codage.append("QUALITE DEGRADEE : moteur de vetos en erreur")

        try:
            apply_decisions(dossier)
            _inject_decision_alerts(dossier, scope="PDF")
            if veto is not None:
                _inject_veto_alerts(dossier, veto, scope="PDF")
        except Exception:
            logger.error(" Décisions : erreur lors du post-traitement", exc_info=True)
            dossier.quality_flags["decision_engine"] = "error"
        finally:
            if rules_token is not None:
                reset_rules_runtime(rules_token)

        try:
            dossier.completude = build_completude_checklist(dossier)
        except Exception:
            logger.error(" Complétude : erreur lors du contrôle", exc_info=True)
            dossier.quality_flags["completude"] = "error"

        # 9. GHM estimation (on final codes) + metrics (active vs excluded)
        try:
            metrics = _compute_metrics(dossier)
            ghm = estimate_ghm(dossier)
            dossier.ghm_estimation = ghm

            logger.info(
                " DAS : actifs=%d / total=%d (écartés=%d, removed=%d, no_code=%d) | Actes : %d (avec code=%d)",
                metrics.das_active,
                metrics.das_total,
                metrics.das_excluded,
                metrics.das_removed,
                metrics.das_no_code,
                metrics.actes_total,
                metrics.actes_with_code,
            )
            logger.info(
                " GHM : CMD=%s, Type=%s, Sévérité=%d → %s",
                ghm.cmd or "?",
                ghm.type_ghm or "?",
                ghm.severite,
                ghm.ghm_approx or "?",
            )
        except Exception:
            logger.error(" Erreur estimation GHM/metrics", exc_info=True)
            dossier.quality_flags["ghm_estimation"] = "error"
            dossier.alertes_codage.append("QUALITE DEGRADEE : estimation GHM en erreur")

        # 10. DP finalizer (Trackare vs CRH arbitration, traceability)
        try:
            from .medical.dp_finalizer import finalize_dp
            finalize_dp(dossier)
        except Exception:
            logger.error(" Finalizer DP : erreur", exc_info=True)
            dossier.quality_flags["dp_finalizer"] = "error"

        dossier.processing_time_s = round(time.time() - t0, 2)
        results.append((anonymized_text, dossier, report))

    logger.info(" Temps total : %.2fs", time.time() - t0)
    return results
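
# Example use from Python (sketch; the path is hypothetical):
#   results = process_document(Path("input/dossier_X/crh.pdf"))
#   for anonymized_text, dossier, report in results:
#       ...  # e.g. hand each tuple to write_outputs()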


# Backward-compatible alias
process_pdf = process_document


def _run_edsnlp(text: str):
    """Run the edsnlp analysis with a graceful fallback."""
    try:
        from .medical.edsnlp_pipeline import analyze, is_available
        if not is_available():
            logger.info(" edsnlp non disponible, utilisation du mode regex seul")
            return None
        result = analyze(text)
        logger.info(
            " edsnlp : %d CIM-10, %d médicaments, %d dates",
            len(result.cim10_entities),
            len(result.drug_entities),
            len(result.date_entities),
        )
        return result
    except Exception:
        logger.warning(" edsnlp : erreur lors de l'analyse, fallback regex", exc_info=True)
        return None


def write_outputs(
    stem: str,
    anonymized_text: str,
    dossier: DossierMedical,
    report: AnonymizationReport,
    subdir: str | None = None,
    export_rum_flag: bool = False,
) -> None:
    """Write the output files."""
    anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR
    struct_dir = STRUCTURED_DIR / subdir if subdir else STRUCTURED_DIR
    rep_dir = REPORTS_DIR / subdir if subdir else REPORTS_DIR

    anon_dir.mkdir(parents=True, exist_ok=True)
    struct_dir.mkdir(parents=True, exist_ok=True)
    rep_dir.mkdir(parents=True, exist_ok=True)

    # Anonymized text
    anon_path = anon_dir / f"{stem}_anonymized.txt"
    anon_path.write_text(anonymized_text, encoding="utf-8")
    logger.info(" → %s", anon_path)

    # Structured JSON
    json_path = struct_dir / f"{stem}_cim10.json"
    json_path.write_text(
        dossier.model_dump_json(indent=2, exclude_none=True),
        encoding="utf-8",
    )
    logger.info(" → %s", json_path)

    # Anonymization report
    report_path = rep_dir / f"{stem}_report.json"
    report_path.write_text(
        report.model_dump_json(indent=2),
        encoding="utf-8",
    )
    logger.info(" → %s", report_path)

    # RUM export
    if export_rum_flag:
        from .export.rum_export import save_rum
        rum_dir = OUTPUT_DIR / "rum"
        if subdir:
            rum_dir = rum_dir / subdir
        rum_dir.mkdir(parents=True, exist_ok=True)
        rum_path = rum_dir / f"{stem}_rum.txt"
        save_rum(dossier, rum_path)
        logger.info(" → %s", rum_path)


def main(input_path: str | None = None) -> None:
    """Main entry point."""
    global _use_edsnlp, _use_rag

    parser = argparse.ArgumentParser(
        description="Anonymisation de documents médicaux et extraction CIM-10 (PDF, images, DOCX)",
    )
    parser.add_argument(
        "input",
        nargs="*",
        default=[input_path or "input/"],
        help="Chemin(s) vers des documents, dossiers patients, ou le dossier racine (défaut: input/)",
    )
    parser.add_argument(
        "--no-ner",
        action="store_true",
        help="Désactiver la phase NER (plus rapide, moins précis)",
    )
    parser.add_argument(
        "--no-edsnlp",
        action="store_true",
        help="Désactiver l'analyse edsnlp (mode regex seul)",
    )
    parser.add_argument(
        "--no-rag",
        action="store_true",
        help="Désactiver l'enrichissement RAG (FAISS + Ollama)",
    )
    parser.add_argument(
        "--build-dict",
        action="store_true",
        help="Générer le dictionnaire CIM-10 depuis metadata.json et quitter",
    )
    parser.add_argument(
        "--build-ccam-dict",
        nargs="?",
        const="CCAM_V81.xls",
        metavar="PATH",
        help="Générer le dictionnaire CCAM depuis un fichier XLS (défaut: CCAM_V81.xls)",
    )
    parser.add_argument(
        "--rebuild-index",
        action="store_true",
        help="Forcer la reconstruction de l'index FAISS",
    )
    parser.add_argument(
        "--rebuild-index-all",
        action="store_true",
        help="Reconstruit les index FAISS + ré-indexe tous les référentiels uploadés",
    )
    parser.add_argument(
        "--export-rum",
        action="store_true",
        help="Exporter les dossiers au format RUM V016 (pour groupeur ATIH)",
    )
    parser.add_argument(
        "--control-cpam",
        metavar="PATH",
        help="Fichier Excel de contrôle CPAM (enrichit les dossiers avec contre-argumentation)",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=1,
        help="Nombre de dossiers traités en parallèle (défaut: 1)",
    )
    args = parser.parse_args()

    if args.build_dict:
        from .medical.cim10_dict import build_dict
        build_dict()
        return

    if args.build_ccam_dict:
        from .medical.ccam_dict import build_dict as build_ccam
        result = build_ccam(args.build_ccam_dict)
        logger.info("Dictionnaire CCAM : %d codes générés", len(result))
        return

    if args.rebuild_index_all:
        from .medical.rag_index import build_index
        build_index(force=True)
        # Re-index every uploaded referential (to apply the new chunking/filtering)
        try:
            from .viewer.referentiels import ReferentielManager
            rm = ReferentielManager()
            total = 0
            for ref in rm.list_all():
                try:
                    total += rm.index_referentiel(ref["id"])
                except Exception:
                    logger.warning("Ré-indexation référentiel échouée : %s", ref.get("filename"), exc_info=True)
            logger.info("Ré-indexation référentiels terminée : %d chunks ajoutés", total)
        except Exception:
            logger.warning("Impossible de ré-indexer les référentiels uploadés", exc_info=True)
        return

    if args.rebuild_index:
        from .medical.rag_index import build_index
        build_index(force=True)
        return

    if args.no_ner:
        # Monkey-patch to disable NER
        from .anonymization import ner_anonymizer
        ner_anonymizer.extract_person_entities = lambda text: []

    if args.no_edsnlp:
        _use_edsnlp = False

    if args.no_rag:
        _use_rag = False

    # Mandatory FAISS check when RAG is enabled
    if _use_rag:
        from .medical.rag_index import check_faiss_ready
        faiss_status = check_faiss_ready()
        if faiss_status["ok"]:
            total_vecs = faiss_status["ref"] + faiss_status["proc"] + faiss_status["bio"] + faiss_status["legacy"]
            logger.info("FAISS OK : %d vecteurs (ref=%d, proc=%d, bio=%d)",
                        total_vecs, faiss_status["ref"], faiss_status["proc"], faiss_status["bio"])
        else:
            for err in faiss_status["errors"]:
                logger.error("FAISS : %s", err)
            logger.error("FAISS non fonctionnel — le codage CIM-10 sera dégradé. "
                         "Lancez : python3 -m src.main --rebuild-index")
            print("\n*** ATTENTION : Index FAISS absent ou invalide ***")
            print("*** Le RAG est désactivé — qualité de codage dégradée ***")
            print("*** Corrigez avec : python3 -m src.main --rebuild-index ***\n")
            _use_rag = False

    export_rum_flag = args.export_rum

    # Load the CPAM control file (auto-detection or explicit flag)
    cpam_data = None
    cpam_path = args.control_cpam
    if not cpam_path:
        # Auto-detection: look for an .xlsx in input/Control_cpam/
        cpam_dir = INPUT_DIR / "Control_cpam"
        if cpam_dir.is_dir():
            xlsx_files = sorted(cpam_dir.glob("*.xlsx"))
            if xlsx_files:
                cpam_path = str(xlsx_files[0])
                logger.info("CPAM : fichier détecté automatiquement → %s", cpam_path)
    if cpam_path:
        from .control.cpam_parser import parse_cpam_excel
        cpam_data = parse_cpam_excel(cpam_path)
        if not cpam_data:
            logger.warning("Aucun contrôle CPAM chargé depuis %s", cpam_path)

    input_paths = args.input

    def _glob_supported(directory: Path) -> list[Path]:
        """Collect every supported file in a directory."""
        files: list[Path] = []
        for ext in sorted(SUPPORTED_EXTENSIONS):
            files.extend(directory.glob(f"*{ext}"))
        return sorted(set(files))

    # Collect the (documents, subdir) groups to process
    groups: list[tuple[list[Path], str | None]] = []

    for p in input_paths:
        input_p = Path(p)
        if input_p.is_file():
            # Single file → subdir = parent directory name (unless it is input/)
            subdir = input_p.parent.name if input_p.parent.name != "input" else None
            groups.append(([input_p], subdir))
        elif input_p.is_dir():
            # Check for documents directly inside this directory
            root_docs = _glob_supported(input_p)
            # Check for sub-directories that contain documents
            sub_dirs = [c for c in sorted(input_p.iterdir()) if c.is_dir() and _glob_supported(c)]

            if sub_dirs:
                # Root directory (like input/) → process each sub-directory
                for child in sub_dirs:
                    sub_docs = _glob_supported(child)
                    groups.append((sub_docs, child.name))
            elif root_docs:
                # Patient directory itself → use its name as subdir
                groups.append((root_docs, input_p.name))
        else:
            logger.error("Chemin introuvable : %s", input_p)
            sys.exit(1)
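
    # Example of the grouping this loop produces (hypothetical layout):
    #   input/patient_A/{crh.pdf, scan.jpg}    → (those two files, subdir="patient_A")
    #   input/lone_report.pdf (passed as file) → ([that file], subdir=None)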

    total = sum(len(docs) for docs, _ in groups)
    if total == 0:
        logger.warning("Aucun document supporté trouvé dans %s", input_p)
        sys.exit(0)

    logger.info("Traitement de %d document(s)...", total)

    def _process_group(docs: list[Path], subdir: str | None) -> None:
        """Process one group of documents (one patient dossier)."""
        if subdir:
            logger.info("--- Dossier %s (%d documents) ---", subdir, len(docs))

        group_dossiers: list[DossierMedical] = []
        for doc_path in docs:
            try:
                doc_results = process_document(doc_path)
                stem = doc_path.stem.replace(" ", "_")
                multi = len(doc_results) > 1
                for part_idx, (anonymized_text, dossier, report) in enumerate(doc_results):
                    part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
                    write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir, export_rum_flag=export_rum_flag)
                    group_dossiers.append(dossier)
            except Exception:
                logger.exception("Erreur lors du traitement de %s", doc_path.name)

        # Multi-PDF merge when the group contains several documents
        merged = None
        if len(group_dossiers) > 1 and subdir:
            try:
                from .medical.fusion import merge_dossiers
                merged = merge_dossiers(group_dossiers)

                struct_dir = STRUCTURED_DIR / subdir
                struct_dir.mkdir(parents=True, exist_ok=True)
                merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"

                # RUM export of the merged dossier
                if export_rum_flag:
                    from .export.rum_export import save_rum
                    rum_dir = OUTPUT_DIR / "rum" / subdir
                    rum_dir.mkdir(parents=True, exist_ok=True)
                    rum_path = rum_dir / f"{subdir}_fusionne_rum.txt"
                    save_rum(merged, rum_path)
                    logger.info(" → RUM fusionné : %s", rum_path)
            except Exception:
                logger.exception("Erreur lors de la fusion du groupe %s", subdir)
                merged = None

        # CPAM control: enrich the main dossier (the merged one, or the last one)
        if cpam_data and subdir:
            try:
                from .control.cpam_parser import match_dossier_ogc
                controles = match_dossier_ogc(subdir, cpam_data)
                if controles:
                    from .control.cpam_response import generate_cpam_response
                    target = merged if merged else (group_dossiers[-1] if group_dossiers else None)
                    if target:
                        logger.info(" CPAM : %d contrôle(s) pour %s", len(controles), subdir)
                        for ctrl in controles:
                            text, response_data, sources = generate_cpam_response(target, ctrl)
                            ctrl.contre_argumentation = text
                            ctrl.response_data = response_data
                            ctrl.sources_reponse = sources
                        target.controles_cpam = controles
            except Exception:
                logger.exception("Erreur CPAM pour %s", subdir)

        # Write the merged dossier (after the optional CPAM enrichment)
        if merged is not None and subdir:
            try:
                # Vetos on the final version (merge + CPAM) + decisions
                # Rules routing (packs) for the merged version
                rules_token = None
                try:
                    rules_ctx = build_rules_runtime_context(merged)
                    merged.rules_runtime = rules_ctx
                    rules_token = set_rules_runtime(rules_ctx)

                    packs = ",".join(rules_ctx.get("enabled_packs", []))
                    if packs:
                        logger.info(" Règles fusionné : packs=%s", packs)
                    if rules_ctx.get("triggers_fired"):
                        logger.info(" Règles fusionné : triggers=%s", ",".join(rules_ctx["triggers_fired"]))
                except Exception:
                    logger.warning(" Routage règles fusionné : erreur", exc_info=True)

                veto = None
                try:
                    veto = apply_vetos(merged)
                    merged.veto_report = veto
                except Exception:
                    logger.warning(" Vetos fusionné : erreur lors du contrôle", exc_info=True)

                try:
                    apply_decisions(merged)
                    _inject_decision_alerts(merged, scope="FINAL")
                    if veto is not None:
                        _inject_veto_alerts(merged, veto, scope="FINAL")
                except Exception:
                    logger.warning(" Décisions fusionné : erreur lors du post-traitement", exc_info=True)
                finally:
                    if rules_token is not None:
                        reset_rules_runtime(rules_token)

                try:
                    merged.completude = build_completude_checklist(merged)
                except Exception:
                    logger.warning(" Complétude fusionné : erreur lors du contrôle", exc_info=True)

                # Re-estimate the GHM (on final codes) + metrics (active vs excluded)
                try:
                    metrics = _compute_metrics(merged)
                    ghm = estimate_ghm(merged)
                    merged.ghm_estimation = ghm
                    logger.info(
                        " Fusion métriques : DAS actifs=%d / total=%d (écartés=%d, removed=%d, no_code=%d) | Actes=%d (avec code=%d)",
                        metrics.das_active,
                        metrics.das_total,
                        metrics.das_excluded,
                        metrics.das_removed,
                        metrics.das_no_code,
                        metrics.actes_total,
                        metrics.actes_with_code,
                    )
                    logger.info(
                        " GHM final : CMD=%s, Type=%s, Sévérité=%d → %s",
                        ghm.cmd or "?",
                        ghm.type_ghm or "?",
                        ghm.severite,
                        ghm.ghm_approx or "?",
                    )
                except Exception:
                    logger.warning(" Erreur estimation GHM/metrics final", exc_info=True)

                # DP finalizer (Trackare vs CRH arbitration, traceability)
                try:
                    from .medical.dp_finalizer import finalize_dp
                    finalize_dp(merged)
                except Exception:
                    logger.warning(" Finalizer DP fusionné : erreur", exc_info=True)

                struct_dir = STRUCTURED_DIR / subdir
                struct_dir.mkdir(parents=True, exist_ok=True)
                merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
                merged_path.write_text(
                    merged.model_dump_json(indent=2, exclude_none=True),
                    encoding="utf-8",
                )
                logger.info(" → Dossier fusionné : %s", merged_path)
            except Exception:
                logger.exception("Erreur écriture dossier fusionné %s", subdir)

    # Sequential or parallel execution depending on --workers
    if args.workers > 1:
        from concurrent.futures import ThreadPoolExecutor, as_completed
        logger.info("Mode parallèle : %d workers", args.workers)
        with ThreadPoolExecutor(max_workers=args.workers) as executor:
            futures = {
                executor.submit(_process_group, docs, subdir): subdir
                for docs, subdir in groups
            }
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    logger.exception("Erreur groupe %s", futures[future])
    else:
        for docs, subdir in groups:
            _process_group(docs, subdir)

    logger.info("Terminé.")


if __name__ == "__main__":
    main()