"""Fusion de dossiers médicaux multi-PDFs pour un même patient. Combine les informations de plusieurs documents (Trackare, CRH, CRO) en un dossier unique avec des règles de priorité et de déduplication. """ from __future__ import annotations import logging from ..config import ( ActeCCAM, BiologieCle, Diagnostic, DossierMedical, Imagerie, Sejour, Traitement, ) from ..medical.das_filter import is_valid_diagnostic_text, apply_semantic_dedup from ..medical.cim10_extractor import _is_dp_family_redundant logger = logging.getLogger(__name__) # Priorité des types de documents pour les données de séjour _DOC_PRIORITY = {"trackare": 0, "crh": 1, "cro": 2} def _cim10_specificity(code: str | None) -> int: """Score de spécificité d'un code CIM-10 : longueur sans le point.""" if not code: return 0 return len(code.replace(".", "")) def _prefer_most_specific_dp(dossiers: list[DossierMedical]) -> Diagnostic | None: """Sélectionne le DP le plus spécifique parmi tous les dossiers.""" candidates: list[tuple[Diagnostic, int]] = [] for d in dossiers: if d.diagnostic_principal: spec = _cim10_specificity(d.diagnostic_principal.cim10_suggestion) candidates.append((d.diagnostic_principal, spec)) if not candidates: return None # Tri : spécificité décroissante, puis confiance (high > medium > low) conf_order = {"high": 0, "medium": 1, "low": 2} candidates.sort( key=lambda x: (-x[1], conf_order.get(x[0].cim10_confidence or "", 3)) ) return candidates[0][0] def _merge_sejour(dossiers: list[DossierMedical]) -> Sejour: """Fusionne les informations de séjour avec priorité Trackare > CRH > CRO.""" # Trier par priorité de type de document sorted_dossiers = sorted( dossiers, key=lambda d: _DOC_PRIORITY.get(d.document_type, 99), ) merged = Sejour() for d in sorted_dossiers: s = d.sejour if s.sexe and not merged.sexe: merged.sexe = s.sexe if s.age is not None and merged.age is None: merged.age = s.age if s.date_entree and not merged.date_entree: merged.date_entree = s.date_entree if s.date_sortie and not merged.date_sortie: merged.date_sortie = s.date_sortie if s.duree_sejour is not None and merged.duree_sejour is None: merged.duree_sejour = s.duree_sejour if s.mode_entree and not merged.mode_entree: merged.mode_entree = s.mode_entree if s.mode_sortie and not merged.mode_sortie: merged.mode_sortie = s.mode_sortie if s.imc is not None and merged.imc is None: merged.imc = s.imc if s.poids is not None and merged.poids is None: merged.poids = s.poids if s.taille is not None and merged.taille is None: merged.taille = s.taille return merged def _is_enriched(d: Diagnostic) -> bool: """Retourne True si le diagnostic a une justification RAG.""" return bool(d.justification or d.sources_rag) def _dedup_diagnostics(all_das: list[Diagnostic]) -> list[Diagnostic]: """Déduplique les diagnostics associés par code CIM-10, garde la meilleure confiance.""" conf_order = {"high": 0, "medium": 1, "low": 2} seen: dict[str | None, Diagnostic] = {} for d in all_das: key = d.cim10_suggestion if key is None: # Sans code, dédup par texte normalisé key = f"__text__{d.texte.lower().strip()}" if key not in seen: seen[key] = d else: existing = seen[key] new_conf = conf_order.get(d.cim10_confidence or "", 3) old_conf = conf_order.get(existing.cim10_confidence or "", 3) # Garder celui avec la meilleure confiance, ou à confiance égale celui enrichi if new_conf < old_conf or (new_conf == old_conf and _is_enriched(d) and not _is_enriched(existing)): seen[key] = d # Supprimer les codes parents quand un code plus spécifique existe # Ex: K85 retiré si K85.9 présent (K85 est préfixe strict de K859) codes = {k for k in seen if k and not k.startswith("__text__")} normalized = {c: c.replace(".", "") for c in codes} parents_to_remove: set[str] = set() for code_a in codes: norm_a = normalized[code_a] for code_b in codes: if code_a == code_b: continue norm_b = normalized[code_b] if norm_b.startswith(norm_a) and len(norm_b) > len(norm_a): parents_to_remove.add(code_a) break for parent in parents_to_remove: del seen[parent] return list(seen.values()) def _dedup_actes(all_actes: list[ActeCCAM]) -> list[ActeCCAM]: """Déduplique les actes CCAM par code.""" seen: dict[str | None, ActeCCAM] = {} for a in all_actes: key = a.code_ccam_suggestion if key is None: key = f"__text__{a.texte.lower().strip()}" if key not in seen: seen[key] = a else: existing = seen[key] # Garder celui avec date si possible if a.date and not existing.date: seen[key] = a return list(seen.values()) def merge_dossiers(dossiers: list[DossierMedical]) -> DossierMedical: """Fusionne plusieurs dossiers médicaux d'un même patient. Args: dossiers: Liste de DossierMedical issus de PDFs différents. Returns: Un DossierMedical fusionné. """ if len(dossiers) == 1: result = dossiers[0].model_copy(deep=True) result.source_files = [result.source_file] # Appliquer la dédup famille DP + sémantique même pour un seul dossier dp_code = result.diagnostic_principal.cim10_suggestion if result.diagnostic_principal else None if dp_code: result.diagnostics_associes = [ d for d in result.diagnostics_associes if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code) ] result.diagnostics_associes = apply_semantic_dedup(result.diagnostics_associes) return result merged = DossierMedical() # Source files merged.source_files = [d.source_file for d in dossiers if d.source_file] # Séjour merged.sejour = _merge_sejour(dossiers) # Diagnostic principal : le plus spécifique merged.diagnostic_principal = _prefer_most_specific_dp(dossiers) # Collecter tous les DAS + DP non retenus comme DAS all_das: list[Diagnostic] = [] for d in dossiers: all_das.extend(d.diagnostics_associes) # Si le DP de ce dossier est différent du DP fusionné, l'ajouter comme DAS # mais seulement si le texte est un diagnostic valide (filtre artefacts OCR) if ( d.diagnostic_principal and merged.diagnostic_principal and d.diagnostic_principal.cim10_suggestion != merged.diagnostic_principal.cim10_suggestion and is_valid_diagnostic_text(d.diagnostic_principal.texte) ): all_das.append(d.diagnostic_principal) merged.diagnostics_associes = _dedup_diagnostics(all_das) # Retirer les DAS redondants avec le DP (même code, famille, parent/enfant) dp_code = merged.diagnostic_principal.cim10_suggestion if merged.diagnostic_principal else None if dp_code: merged.diagnostics_associes = [ d for d in merged.diagnostics_associes if not d.cim10_suggestion or not _is_dp_family_redundant(d.cim10_suggestion, dp_code) ] # Redondances sémantiques entre DAS merged.diagnostics_associes = apply_semantic_dedup(merged.diagnostics_associes) # Actes CCAM all_actes: list[ActeCCAM] = [] for d in dossiers: all_actes.extend(d.actes_ccam) merged.actes_ccam = _dedup_actes(all_actes) # Biologie : union, dédup par (test, valeur) bio_seen: set[tuple[str, str | None]] = set() for d in dossiers: for b in d.biologie_cle: key = (b.test, b.valeur) if key not in bio_seen: merged.biologie_cle.append(b) bio_seen.add(key) # Imagerie : union, dédup par (type, conclusion) img_seen: set[tuple[str, str | None]] = set() for d in dossiers: for i in d.imagerie: key = (i.type, i.conclusion) if key not in img_seen: merged.imagerie.append(i) img_seen.add(key) # Traitements : union, dédup par médicament (normalisé) med_seen: set[str] = set() for d in dossiers: for t in d.traitements_sortie: key = t.medicament.lower().strip() if key not in med_seen: merged.traitements_sortie.append(t) med_seen.add(key) # Antécédents : union, dédup par texte normalisé ant_seen: set[str] = set() for d in dossiers: for a in d.antecedents: key = a.lower().strip() if key not in ant_seen: merged.antecedents.append(a) ant_seen.add(key) # Complications : union, dédup par texte normalisé comp_seen: set[str] = set() for d in dossiers: for c in d.complications: key = c.lower().strip() if key not in comp_seen: merged.complications.append(c) comp_seen.add(key) # Alertes : alerte de fusion en tête + union merged.alertes_codage = [f"FUSION: {len(dossiers)} documents fusionnés"] alert_seen: set[str] = set() for d in dossiers: for a in d.alertes_codage: if a not in alert_seen: merged.alertes_codage.append(a) alert_seen.add(a) # Document type : le type prioritaire sorted_by_prio = sorted( dossiers, key=lambda d: _DOC_PRIORITY.get(d.document_type, 99), ) merged.document_type = sorted_by_prio[0].document_type logger.info( "Fusion de %d dossiers : DP=%s, %d DAS, %d actes", len(dossiers), merged.diagnostic_principal.cim10_suggestion if merged.diagnostic_principal else "aucun", len(merged.diagnostics_associes), len(merged.actes_ccam), ) return merged