"""CLI + orchestrateur du pipeline d'anonymisation et extraction CIM-10.""" from __future__ import annotations import argparse import json import logging import sys import time from pathlib import Path from .anonymization.anonymizer import Anonymizer from .config import ANONYMIZED_DIR, INPUT_DIR, OUTPUT_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical, VetoReport from .extraction.document_classifier import classify from .extraction.crh_parser import parse_crh from .extraction.document_splitter import split_documents from .extraction.pdf_extractor import extract_text, extract_text_with_pages from .extraction.trackare_parser import parse_trackare from .medical.cim10_extractor import extract_medical_info from .medical.ghm import estimate_ghm from .quality.veto_engine import apply_vetos from .quality.decision_engine import apply_decisions, decision_summaries logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", ) logger = logging.getLogger(__name__) def _inject_veto_alerts(dossier: DossierMedical, veto: VetoReport, scope: str = "FINAL") -> None: """Injecte les alertes liées aux vetos dans alertes_codage en évitant les doublons. On *remplace* la section VETO précédente (qu'elle vienne d'un PDF individuel ou d'une passe de fusion), afin que le JSON fusionné reste lisible. """ cleaned: list[str] = [] for line in (dossier.alertes_codage or []): if isinstance(line, str) and (line.startswith("VETOS:") or line.startswith("VETOS[") or line.startswith("VETO-")): continue cleaned.append(line) dossier.alertes_codage = cleaned if veto.verdict != "PASS": dossier.alertes_codage.append(f"VETOS[{scope}]: {veto.verdict} (score={veto.score_contestabilite})") for it in veto.issues[:25]: dossier.alertes_codage.append(f"{it.veto} [{it.severity}] {it.where}: {it.message}") def _inject_decision_alerts(dossier: DossierMedical, scope: str = "FINAL") -> None: """Injecte les décisions (downgrade/suppression) dans alertes_codage. On remplace la section DECISION précédente pour garder un JSON lisible. """ cleaned: list[str] = [] for line in (dossier.alertes_codage or []): if isinstance(line, str) and line.startswith("DECISION:"): continue cleaned.append(line) dossier.alertes_codage = cleaned lines = decision_summaries(dossier) if lines: dossier.alertes_codage.append(f"DECISIONS[{scope}]: {len(lines)} ligne(s)") dossier.alertes_codage.extend(lines[:30]) # Flags globaux _use_edsnlp = True _use_rag = True def process_pdf(pdf_path: Path) -> list[tuple[str, DossierMedical, AnonymizationReport]]: """Traite un PDF : extraction → splitting → parsing → anonymisation → extraction CIM-10. Retourne une liste de (texte_anonymisé, dossier, rapport) — un par dossier détecté. """ t0 = time.time() logger.info("Traitement de %s", pdf_path.name) # 1. Extraction texte avec pages raw_text, page_tracker = extract_text_with_pages(pdf_path) logger.info(" Texte extrait : %d caractères", len(raw_text)) # 2. Classification doc_type = classify(raw_text) logger.info(" Type de document : %s", doc_type) # 3. Splitting multi-dossiers chunks = split_documents(raw_text, doc_type) if len(chunks) > 1: logger.info(" Découpage : %d dossiers détectés dans %s", len(chunks), pdf_path.name) results: list[tuple[str, DossierMedical, AnonymizationReport]] = [] for i, chunk_text in enumerate(chunks): part_label = f" [part {i+1}/{len(chunks)}]" if len(chunks) > 1 else "" logger.info(" Traitement%s...", part_label) # 4. 
        if doc_type == "trackare":
            parsed = parse_trackare(chunk_text)
        else:
            parsed = parse_crh(chunk_text)

        # 5. Anonymization
        anonymizer = Anonymizer(parsed_data=parsed)
        anonymized_text = anonymizer.anonymize(chunk_text)
        report = anonymizer.report
        report.source_file = pdf_path.name
        logger.info(
            "  Anonymization%s: %d replacements (regex=%d, ner=%d, sweep=%d)",
            part_label,
            report.total_replacements,
            report.regex_replacements,
            report.ner_replacements,
            report.sweep_replacements,
        )

        # 6. edsnlp analysis (optional)
        edsnlp_result = None
        if _use_edsnlp:
            edsnlp_result = _run_edsnlp(anonymized_text)

        # 7. CIM-10 medical extraction
        dossier = extract_medical_info(
            parsed,
            anonymized_text,
            edsnlp_result,
            use_rag=_use_rag,
            page_tracker=page_tracker,
            raw_text=raw_text,
        )
        dossier.source_file = pdf_path.name
        dossier.document_type = doc_type
        logger.info("  DP%s: %s", part_label, dossier.diagnostic_principal)
        logger.info("  DAS: %d, CCAM procedures: %d",
                    len(dossier.diagnostics_associes), len(dossier.actes_ccam))

        # 8. GHM estimation
        try:
            ghm = estimate_ghm(dossier)
            dossier.ghm_estimation = ghm
            logger.info("  GHM: CMD=%s, type=%s, severity=%d → %s",
                        ghm.cmd or "?", ghm.type_ghm or "?", ghm.severite, ghm.ghm_approx or "?")
        except Exception:
            logger.warning("  GHM estimation failed", exc_info=True)

        # 9. Vetos (contestability check)
        try:
            veto = apply_vetos(dossier)
            dossier.veto_report = veto
            apply_decisions(dossier)
            _inject_decision_alerts(dossier, scope="PDF")
            _inject_veto_alerts(dossier, veto, scope="PDF")
        except Exception:
            logger.warning("  Vetos: check failed", exc_info=True)

        dossier.processing_time_s = round(time.time() - t0, 2)
        results.append((anonymized_text, dossier, report))

    logger.info("  Total time: %.2fs", time.time() - t0)
    return results


def _run_edsnlp(text: str):
    """Run the edsnlp analysis, falling back gracefully when unavailable."""
    try:
        from .medical.edsnlp_pipeline import analyze, is_available

        if not is_available():
            logger.info("  edsnlp unavailable, using regex-only mode")
            return None
        result = analyze(text)
        logger.info(
            "  edsnlp: %d CIM-10, %d drugs, %d dates",
            len(result.cim10_entities),
            len(result.drug_entities),
            len(result.date_entities),
        )
        return result
    except Exception:
        logger.warning("  edsnlp: analysis failed, falling back to regex", exc_info=True)
        return None


def write_outputs(
    stem: str,
    anonymized_text: str,
    dossier: DossierMedical,
    report: AnonymizationReport,
    subdir: str | None = None,
    export_rum_flag: bool = False,
) -> None:
    """Write the output files."""
    anon_dir = ANONYMIZED_DIR / subdir if subdir else ANONYMIZED_DIR
    struct_dir = STRUCTURED_DIR / subdir if subdir else STRUCTURED_DIR
    rep_dir = REPORTS_DIR / subdir if subdir else REPORTS_DIR
    anon_dir.mkdir(parents=True, exist_ok=True)
    struct_dir.mkdir(parents=True, exist_ok=True)
    rep_dir.mkdir(parents=True, exist_ok=True)

    # Anonymized text
    anon_path = anon_dir / f"{stem}_anonymized.txt"
    anon_path.write_text(anonymized_text, encoding="utf-8")
    logger.info("  → %s", anon_path)

    # Structured JSON
    json_path = struct_dir / f"{stem}_cim10.json"
    json_path.write_text(
        dossier.model_dump_json(indent=2, exclude_none=True),
        encoding="utf-8",
    )
    logger.info("  → %s", json_path)

    # Anonymization report
    report_path = rep_dir / f"{stem}_report.json"
    report_path.write_text(
        report.model_dump_json(indent=2),
        encoding="utf-8",
    )
    logger.info("  → %s", report_path)

    # RUM export
    if export_rum_flag:
        from .export.rum_export import save_rum

        rum_dir = OUTPUT_DIR / "rum"
        if subdir:
            rum_dir = rum_dir / subdir
        rum_dir.mkdir(parents=True, exist_ok=True)
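        # One RUM V016 file per dossier, the format consumed by the ATIH
        # grouper (see the --export-rum flag in main()).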
        rum_path = rum_dir / f"{stem}_rum.txt"
        save_rum(dossier, rum_path)
        logger.info("  → %s", rum_path)


def main(input_path: str | None = None) -> None:
    """Main entry point."""
    global _use_edsnlp, _use_rag

    parser = argparse.ArgumentParser(
        description="Anonymize medical PDF documents and extract CIM-10 codes",
    )
    parser.add_argument(
        "input",
        nargs="*",
        default=[input_path or "input/"],
        help="Path(s) to PDFs, patient folders, or the root folder (default: input/)",
    )
    parser.add_argument(
        "--no-ner",
        action="store_true",
        help="Disable the NER pass (faster, less accurate)",
    )
    parser.add_argument(
        "--no-edsnlp",
        action="store_true",
        help="Disable the edsnlp analysis (regex-only mode)",
    )
    parser.add_argument(
        "--no-rag",
        action="store_true",
        help="Disable RAG enrichment (FAISS + Ollama)",
    )
    parser.add_argument(
        "--build-dict",
        action="store_true",
        help="Build the CIM-10 dictionary from metadata.json, then exit",
    )
    parser.add_argument(
        "--build-ccam-dict",
        nargs="?",
        const="CCAM_V81.xls",
        metavar="PATH",
        help="Build the CCAM dictionary from an XLS file (default: CCAM_V81.xls)",
    )
    parser.add_argument(
        "--rebuild-index",
        action="store_true",
        help="Force a rebuild of the FAISS index",
    )
    parser.add_argument(
        "--export-rum",
        action="store_true",
        help="Export dossiers in RUM V016 format (for the ATIH grouper)",
    )
    parser.add_argument(
        "--control-cpam",
        metavar="PATH",
        help="CPAM audit Excel file (enriches dossiers with counter-argumentation)",
    )
    args = parser.parse_args()

    if args.build_dict:
        from .medical.cim10_dict import build_dict

        build_dict()
        return
    if args.build_ccam_dict:
        from .medical.ccam_dict import build_dict as build_ccam

        result = build_ccam(args.build_ccam_dict)
        logger.info("CCAM dictionary: %d codes generated", len(result))
        return
    if args.rebuild_index:
        from .medical.rag_index import build_index

        build_index(force=True)
        return

    if args.no_ner:
        # Monkey-patch to disable NER
        from .anonymization import ner_anonymizer

        ner_anonymizer.extract_person_entities = lambda text: []
    if args.no_edsnlp:
        _use_edsnlp = False
    if args.no_rag:
        _use_rag = False
    export_rum_flag = args.export_rum

    # Load CPAM audit data (auto-detected or via explicit flag)
    cpam_data = None
    cpam_path = args.control_cpam
    if not cpam_path:
        # Auto-detection: look for an .xlsx under input/Control_cpam/
        cpam_dir = INPUT_DIR / "Control_cpam"
        if cpam_dir.is_dir():
            xlsx_files = sorted(cpam_dir.glob("*.xlsx"))
            if xlsx_files:
                cpam_path = str(xlsx_files[0])
                logger.info("CPAM: file auto-detected → %s", cpam_path)
    if cpam_path:
        from .control.cpam_parser import parse_cpam_excel

        cpam_data = parse_cpam_excel(cpam_path)
        if not cpam_data:
            logger.warning("No CPAM audit data loaded from %s", cpam_path)

    input_paths = args.input

    # Collect the (pdfs, subdir) groups to process
    groups: list[tuple[list[Path], str | None]] = []
    for p in input_paths:
        input_p = Path(p)
        if input_p.is_file():
            # Single file → subdir = parent folder name (unless it is input/)
            subdir = input_p.parent.name if input_p.parent.name != "input" else None
            groups.append(([input_p], subdir))
        elif input_p.is_dir():
            # PDFs directly inside this folder?
            root_pdfs = sorted(input_p.glob("*.pdf"))
            # Sub-folders containing PDFs?
            sub_dirs = [c for c in sorted(input_p.iterdir()) if c.is_dir() and list(c.glob("*.pdf"))]
            if sub_dirs:
                # Root folder (like input/) → process each sub-folder
                for child in sub_dirs:
                    sub_pdfs = sorted(child.glob("*.pdf"))
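                    # Each sub-folder is treated as one patient dossier: its
                    # name becomes the output subdir, the fusion group, and
                    # the key used to match CPAM audit rows.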
                    groups.append((sub_pdfs, child.name))
            elif root_pdfs:
                # A patient folder itself → use its name as the subdir
                groups.append((root_pdfs, input_p.name))
        else:
            logger.error("Path not found: %s", input_p)
            sys.exit(1)

    total = sum(len(pdfs) for pdfs, _ in groups)
    if total == 0:
        logger.warning("No PDFs found in %s", input_p)
        sys.exit(0)
    logger.info("Processing %d PDF(s)...", total)

    for pdfs, subdir in groups:
        if subdir:
            logger.info("--- Dossier %s (%d PDFs) ---", subdir, len(pdfs))
        group_dossiers: list[DossierMedical] = []
        for pdf_path in pdfs:
            try:
                pdf_results = process_pdf(pdf_path)
                stem = pdf_path.stem.replace(" ", "_")
                multi = len(pdf_results) > 1
                for part_idx, (anonymized_text, dossier, report) in enumerate(pdf_results):
                    part_stem = f"{stem}_part{part_idx + 1}" if multi else stem
                    write_outputs(part_stem, anonymized_text, dossier, report,
                                  subdir=subdir, export_rum_flag=export_rum_flag)
                    group_dossiers.append(dossier)
            except Exception:
                logger.exception("Error while processing %s", pdf_path.name)

        # Multi-PDF fusion when the group holds several documents
        merged = None
        if len(group_dossiers) > 1 and subdir:
            try:
                from .medical.fusion import merge_dossiers

                merged = merge_dossiers(group_dossiers)
                # Re-estimate the GHM on the merged dossier (consolidated DP/DAS)
                try:
                    ghm = estimate_ghm(merged)
                    merged.ghm_estimation = ghm
                    logger.info("  Merged GHM: CMD=%s, type=%s, severity=%d → %s",
                                ghm.cmd or "?", ghm.type_ghm or "?", ghm.severite, ghm.ghm_approx or "?")
                except Exception:
                    logger.warning("  Merged GHM estimation failed", exc_info=True)
                # RUM export of the merged dossier
                if export_rum_flag:
                    from .export.rum_export import save_rum

                    rum_dir = OUTPUT_DIR / "rum" / subdir
                    rum_dir.mkdir(parents=True, exist_ok=True)
                    rum_path = rum_dir / f"{subdir}_fusionne_rum.txt"
                    save_rum(merged, rum_path)
                    logger.info("  → Merged RUM: %s", rum_path)
            except Exception:
                logger.exception("Error while merging group %s", subdir)
                merged = None

        # CPAM audit: enrich the main dossier (merged, or the last one)
        if cpam_data and subdir:
            try:
                from .control.cpam_parser import match_dossier_ogc

                controles = match_dossier_ogc(subdir, cpam_data)
                if controles:
                    from .control.cpam_response import generate_cpam_response

                    target = merged if merged else (group_dossiers[-1] if group_dossiers else None)
                    if target:
                        logger.info("  CPAM: %d audit record(s) for %s", len(controles), subdir)
                        for ctrl in controles:
                            text, response_data, sources = generate_cpam_response(target, ctrl)
                            ctrl.contre_argumentation = text
                            ctrl.response_data = response_data
                            ctrl.sources_reponse = sources
                        target.controles_cpam = controles
            except Exception:
                logger.exception("CPAM error for %s", subdir)

        # Write the merged dossier (after any CPAM enrichment)
        if merged is not None and subdir:
            try:
                # Vetos on the final version (fusion + CPAM)
                try:
                    veto = apply_vetos(merged)
                    merged.veto_report = veto
                    apply_decisions(merged)
                    _inject_decision_alerts(merged, scope="FINAL")
                    _inject_veto_alerts(merged, veto, scope="FINAL")
                except Exception:
                    logger.warning("  Merged vetos: check failed", exc_info=True)
                struct_dir = STRUCTURED_DIR / subdir
                struct_dir.mkdir(parents=True, exist_ok=True)
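                # The merged JSON lands next to the per-PDF *_cim10.json
                # files that write_outputs produced for the same subdir.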
                merged_path = struct_dir / f"{subdir}_fusionne_cim10.json"
                merged_path.write_text(
                    merged.model_dump_json(indent=2, exclude_none=True),
                    encoding="utf-8",
                )
                logger.info("  → Merged dossier: %s", merged_path)
            except Exception:
                logger.exception("Error writing merged dossier %s", subdir)

    logger.info("Done.")


if __name__ == "__main__":
    main()
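
# ---------------------------------------------------------------------------
# Usage sketch. The package name "pipeline" below is an assumption (this file
# only uses relative imports); substitute the real package/module name.
#
#   python -m pipeline.main input/                     # whole input/ tree
#   python -m pipeline.main input/PATIENT_X --export-rum
#   python -m pipeline.main --build-dict               # build the CIM-10 dict, then exit
#   python -m pipeline.main --no-edsnlp --no-rag some.pdf
#
# A single PDF can also be processed without the CLI (same assumed import
# path):
#
#   from pathlib import Path
#   from pipeline.main import process_pdf, write_outputs
#
#   for text, dossier, report in process_pdf(Path("some.pdf")):
#       write_outputs(Path("some.pdf").stem, text, dossier, report)
# ---------------------------------------------------------------------------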