feat: cache Ollama + parallélisation ThreadPool + filtrage DAS renforcé + modules GHM/CPAM/export RUM
- Cache persistant JSON thread-safe pour les résultats Ollama (invalidation par modèle)
- Parallélisation des appels Ollama (ThreadPoolExecutor, 2 workers)
- 6 nouvelles règles de filtrage DAS parasites (doublons, ponctuation, OCR, labo, fragments)
- Client Ollama centralisé (mode JSON natif + retry)
- Module GHM (estimation CMD/sévérité)
- Module contrôle CPAM (parser + contre-argumentation RAG)
- Export RUM (format RSS)
- Viewer enrichi (détail dossier)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
86
src/main.py
86
src/main.py
@@ -10,13 +10,14 @@ import time
|
||||
from pathlib import Path
|
||||
|
||||
from .anonymization.anonymizer import Anonymizer
|
||||
from .config import ANONYMIZED_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
|
||||
from .config import ANONYMIZED_DIR, OUTPUT_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
|
||||
from .extraction.document_classifier import classify
|
||||
from .extraction.crh_parser import parse_crh
|
||||
from .extraction.document_splitter import split_documents
|
||||
from .extraction.pdf_extractor import extract_text
|
||||
from .extraction.trackare_parser import parse_trackare
|
||||
from .medical.cim10_extractor import extract_medical_info
|
||||
from .medical.ghm import estimate_ghm
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -84,10 +85,20 @@ def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, Anonymization
|
||||
dossier = extract_medical_info(parsed, anonymized_text, edsnlp_result, use_rag=_use_rag)
|
||||
dossier.source_file = pdf_path.name
|
||||
dossier.document_type = doc_type
|
||||
dossier.processing_time_s = round(time.time() - t0, 2)
|
||||
logger.info(" DP%s : %s", part_label, dossier.diagnostic_principal)
|
||||
logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))
|
||||
|
||||
# 8. Estimation GHM
|
||||
try:
|
||||
ghm = estimate_ghm(dossier)
|
||||
dossier.ghm_estimation = ghm
|
||||
logger.info(" GHM : CMD=%s, Type=%s, Sévérité=%d → %s",
|
||||
ghm.cmd or "?", ghm.type_ghm or "?",
|
||||
ghm.severite, ghm.ghm_approx or "?")
|
||||
except Exception:
|
||||
logger.warning(" Erreur estimation GHM", exc_info=True)
|
||||
|
||||
dossier.processing_time_s = round(time.time() - t0, 2)
|
||||
results.append((anonymized_text, dossier, report))
|
||||
|
||||
logger.info(" Temps total : %.2fs", time.time() - t0)
|
||||
@@ -120,6 +131,7 @@ def write_outputs(
|
||||
dossier: DossierMedical,
|
||||
report: AnonymizationReport,
|
||||
subdir: str | None = None,
|
||||
export_rum_flag: bool = False,
|
||||
) -> None:
|
||||
"""Écrit les fichiers de sortie."""
|
||||
anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR
|
||||
@@ -151,6 +163,17 @@ def write_outputs(
|
||||
)
|
||||
logger.info(" → %s", report_path)
|
||||
|
||||
# Export RUM
|
||||
if export_rum_flag:
|
||||
from .export.rum_export import save_rum
|
||||
rum_dir = OUTPUT_DIR / "rum"
|
||||
if subdir:
|
||||
rum_dir = rum_dir / subdir
|
||||
rum_dir.mkdir(parents=True, exist_ok=True)
|
||||
rum_path = rum_dir / f"{stem}_rum.txt"
|
||||
save_rum(dossier, rum_path)
|
||||
logger.info(" → %s", rum_path)
|
||||
|
||||
|
||||
def main(input_path: str | None = None) -> None:
|
||||
"""Point d'entrée principal."""
|
||||
@@ -197,6 +220,16 @@ def main(input_path: str | None = None) -> None:
|
||||
action="store_true",
|
||||
help="Forcer la reconstruction de l'index FAISS",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--export-rum",
|
||||
action="store_true",
|
||||
help="Exporter les dossiers au format RUM V016 (pour groupeur ATIH)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--control-cpam",
|
||||
metavar="PATH",
|
||||
help="Fichier Excel de contrôle CPAM (enrichit les dossiers avec contre-argumentation)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.build_dict:
|
||||
@@ -226,6 +259,16 @@ def main(input_path: str | None = None) -> None:
|
||||
if args.no_rag:
|
||||
_use_rag = False
|
||||
|
||||
export_rum_flag = args.export_rum
|
||||
|
||||
# Chargement contrôle CPAM
|
||||
cpam_data = None
|
||||
if args.control_cpam:
|
||||
from .control.cpam_parser import parse_cpam_excel
|
||||
cpam_data = parse_cpam_excel(args.control_cpam)
|
||||
if not cpam_data:
|
||||
logger.warning("Aucun contrôle CPAM chargé depuis %s", args.control_cpam)
|
||||
|
||||
input_paths = args.input
|
||||
|
||||
# Collecte des groupes (pdfs, subdir) à traiter
|
||||
@@ -274,12 +317,13 @@ def main(input_path: str | None = None) -> None:
|
||||
multi = len(pdf_results) > 1
|
||||
for part_idx, (anonymized_text, dossier, report) in enumerate(pdf_results):
|
||||
part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
|
||||
write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir)
|
||||
write_outputs(part_stem, anonymized_text, dossier, report, subdir=subdir, export_rum_flag=export_rum_flag)
|
||||
group_dossiers.append(dossier)
|
||||
except Exception:
|
||||
logger.exception("Erreur lors du traitement de %s", pdf_path.name)
|
||||
|
||||
# Fusion multi-PDFs si plusieurs documents dans le même groupe
|
||||
merged = None
|
||||
if len(group_dossiers) > 1 and subdir:
|
||||
try:
|
||||
from .medical.fusion import merge_dossiers
|
||||
@@ -287,13 +331,47 @@ def main(input_path: str | None = None) -> None:
|
||||
struct_dir = STRUCTURED_DIR / subdir
|
||||
struct_dir.mkdir(parents=True, exist_ok=True)
|
||||
merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
|
||||
|
||||
# Export RUM du dossier fusionné
|
||||
if export_rum_flag:
|
||||
from .export.rum_export import save_rum
|
||||
rum_dir = OUTPUT_DIR / "rum" / subdir
|
||||
rum_dir.mkdir(parents=True, exist_ok=True)
|
||||
rum_path = rum_dir / f"{subdir}_fusionne_rum.txt"
|
||||
save_rum(merged, rum_path)
|
||||
logger.info(" → RUM fusionné : %s", rum_path)
|
||||
except Exception:
|
||||
logger.exception("Erreur lors de la fusion du groupe %s", subdir)
|
||||
merged = None
|
||||
|
||||
# Contrôle CPAM : enrichir le dossier principal (fusionné ou dernier)
|
||||
if cpam_data and subdir:
|
||||
from .control.cpam_parser import match_dossier_ogc
|
||||
controles = match_dossier_ogc(subdir, cpam_data)
|
||||
if controles:
|
||||
from .control.cpam_response import generate_cpam_response
|
||||
target = merged if merged else (group_dossiers[-1] if group_dossiers else None)
|
||||
if target:
|
||||
logger.info(" CPAM : %d contrôle(s) pour %s", len(controles), subdir)
|
||||
for ctrl in controles:
|
||||
text, sources = generate_cpam_response(target, ctrl)
|
||||
ctrl.contre_argumentation = text
|
||||
ctrl.sources_reponse = sources
|
||||
target.controles_cpam = controles
|
||||
|
||||
# Écrire le dossier fusionné (après enrichissement CPAM éventuel)
|
||||
if merged is not None and subdir:
|
||||
try:
|
||||
struct_dir = STRUCTURED_DIR / subdir
|
||||
struct_dir.mkdir(parents=True, exist_ok=True)
|
||||
merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
|
||||
merged_path.write_text(
|
||||
merged.model_dump_json(indent=2, exclude_none=True),
|
||||
encoding="utf-8",
|
||||
)
|
||||
logger.info(" → Dossier fusionné : %s", merged_path)
|
||||
except Exception:
|
||||
logger.exception("Erreur lors de la fusion du groupe %s", subdir)
|
||||
logger.exception("Erreur écriture dossier fusionné %s", subdir)
|
||||
|
||||
logger.info("Terminé.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user