""" Tests for MetricsCollector. Validates: Requirements 18.1-18.9 """ import pytest from datetime import datetime, timedelta from sqlalchemy.orm import Session from src.pipeline_mco_pmsi.metrics import MetricsCollector, QualityMetrics, MetricsThresholds from pipeline_mco_pmsi.database.models import ( StayDB, CodeDB, ClinicalFactDB, EvidenceDB, QuestionDB, TIMCorrectionDB, ) def create_stay_with_codes( session: Session, stay_id: int, codes_data: list, facts_data: list = None ) -> StayDB: """Helper to create a stay with codes and facts.""" stay = StayDB( id=stay_id, stay_id=f"stay_{stay_id}", admission_date=datetime.now() - timedelta(days=7), discharge_date=datetime.now() - timedelta(days=1), specialty="MCO", created_at=datetime.now() ) session.add(stay) session.flush() # Add facts if provided if facts_data: for fact_data in facts_data: fact = ClinicalFactDB( stay_id=stay.id, fact_id=f"fact_{stay_id}_{len(stay.facts)}", evidence_document_id="doc_1", evidence_span_start=0, evidence_span_end=10, evidence_text=fact_data.get("text", ""), **fact_data ) session.add(fact) # Add codes for code_data in codes_data: evidence_data = code_data.pop("evidence", []) # Map code_type to type and confidence_score to confidence if "code_type" in code_data: code_data["type"] = code_data.pop("code_type") if "confidence_score" in code_data: code_data["confidence"] = code_data.pop("confidence_score") # Add required fields code_data.setdefault("label", f"Label for {code_data['code']}") code_data.setdefault("reasoning", "Test reasoning") code_data.setdefault("referentiel_version", "2026") code_data.setdefault("model_name", "test_model") code_data.setdefault("model_digest", "test_digest") code_data.setdefault("prompt_version", "1.0") code = CodeDB( stay_id=stay.id, **code_data ) session.add(code) session.flush() # Add evidence for ev_data in evidence_data: evidence = EvidenceDB( code_id=code.id, **ev_data ) session.add(evidence) session.commit() return stay def test_metrics_collector_initialization(db_session): """Test MetricsCollector initialization.""" collector = MetricsCollector(db_session) assert collector.session == db_session assert collector.thresholds is not None assert isinstance(collector.thresholds, MetricsThresholds) def test_calculate_codes_without_evidence(db_session): """ Test calculation of codes without evidence percentage. Validates: Requirement 18.1 """ # Create stay with codes - some with evidence, some without create_stay_with_codes( db_session, stay_id=1, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"} ] }, { "code": "E11.9", "code_type": "das", "confidence_score": 0.8, "evidence": [] # No evidence }, { "code": "YYYY001", "code_type": "ccam", "confidence_score": 0.7, "evidence": [] # No evidence } ] ) collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 2 out of 3 codes have no evidence = 66.67% assert metrics.codes_without_evidence_pct == pytest.approx(66.67, rel=0.1) assert metrics.total_codes == 3 def test_calculate_negated_coded_as_affirmed(db_session): """ Test calculation of negated diagnoses coded as affirmed. Validates: Requirement 18.2 """ # Create stay with negated fact but coded anyway create_stay_with_codes( db_session, stay_id=2, facts_data=[ { "type": "diagnostic", "text": "Gastrite", "qualifier_certainty": "nié", "qualifier_markers": [], "qualifier_confidence": 0.9, "temporality": "actuel", "confidence": 0.9 } ], codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"} ] } ] ) collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 1 out of 1 diagnostic code is negated = 100% assert metrics.negated_coded_as_affirmed_pct == 100.0 def test_calculate_dp_accuracy_with_gold_standard(db_session): """ Test calculation of DP accuracy vs gold standard. Validates: Requirement 18.3 """ # Create stays with correct and incorrect DP create_stay_with_codes( db_session, stay_id=3, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"} ] } ] ) create_stay_with_codes( db_session, stay_id=4, codes_data=[ { "code": "E11.9", "code_type": "dp", "confidence_score": 0.8, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"} ] } ] ) gold_standard = { 3: "K29.7", # Correct 4: "K29.7", # Incorrect (proposed E11.9) } collector = MetricsCollector(db_session) metrics = collector.calculate_metrics(gold_standard=gold_standard) # 1 out of 2 correct = 50% assert metrics.dp_accuracy_pct == 50.0 def test_calculate_phantom_das(db_session): """ Test calculation of phantom DAS percentage. Validates: Requirement 18.4 """ # Create stay with DAS - some phantom (no/weak evidence) create_stay_with_codes( db_session, stay_id=5, codes_data=[ { "code": "E11.9", "code_type": "das", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"} ] }, { "code": "I10", "code_type": "das", "confidence_score": 0.3, # Low confidence = phantom "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "HTA"} ] }, { "code": "Z86.7", "code_type": "das", "confidence_score": 0.8, "evidence": [] # No evidence = phantom } ] ) collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 2 out of 3 DAS are phantom = 66.67% assert metrics.phantom_das_pct == pytest.approx(66.67, rel=0.1) def test_calculate_ccam_without_evidence(db_session): """ Test calculation of CCAM acts without evidence. Validates: Requirement 18.5 """ # Create stay with CCAM codes - some without evidence create_stay_with_codes( db_session, stay_id=6, codes_data=[ { "code": "YYYY001", "code_type": "ccam", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Intervention"} ] }, { "code": "ZZZZ002", "code_type": "ccam", "confidence_score": 0.8, "evidence": [] # No evidence } ] ) collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 1 out of 2 CCAM without evidence = 50% assert metrics.ccam_without_evidence_pct == 50.0 def test_calculate_one_click_validation(db_session): """ Test calculation of one-click validation rate. Validates: Requirement 18.6 """ # Create validated stay without corrections (one-click) stay1 = create_stay_with_codes( db_session, stay_id=7, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "status": "accepted", "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"} ] } ] ) db_session.commit() # Create validated stay with corrections (not one-click) stay2 = create_stay_with_codes( db_session, stay_id=8, codes_data=[ { "code": "E11.9", "code_type": "dp", "confidence_score": 0.8, "status": "accepted", "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"} ] } ] ) db_session.commit() # Add correction to stay2 correction = TIMCorrectionDB( original_code_id=stay2.codes[0].id, corrected_code="E11.0", corrected_label="Diabète de type 1", corrected_type="dp", user_id="tim_001", timestamp=datetime.now() ) db_session.add(correction) db_session.commit() collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 1 out of 2 validated stays is one-click = 50% assert metrics.one_click_validation_pct == 50.0 def test_calculate_question_relevance(db_session): """ Test calculation of question relevance percentage. Validates: Requirement 18.7 """ # Create stay with questions - some with relevance feedback stay = create_stay_with_codes( db_session, stay_id=9, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"} ] } ] ) # Add questions with relevance scores q1 = QuestionDB( stay_id=stay.id, question_id="q1", text="Date de début des symptômes?", priority=1, category="clinical", context="Missing information", suggested_answers=[] ) q1.relevance_score = 0.9 # Relevant db_session.add(q1) q2 = QuestionDB( stay_id=stay.id, question_id="q2", text="Antécédents familiaux?", priority=2, category="clinical", context="Missing information", suggested_answers=[] ) q2.relevance_score = 0.5 # Not relevant db_session.add(q2) db_session.commit() collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() # 1 out of 2 questions is relevant (score >= 0.7) = 50% assert metrics.question_relevance_pct == 50.0 def test_detect_drift(db_session): """ Test drift detection between baseline and current metrics. Validates: Requirement 18.8 """ baseline = QualityMetrics( codes_without_evidence_pct=2.0, negated_coded_as_affirmed_pct=0.5, dp_accuracy_pct=85.0, phantom_das_pct=5.0, ccam_without_evidence_pct=1.0, one_click_validation_pct=60.0 ) # Current metrics with significant drift current = QualityMetrics( codes_without_evidence_pct=5.0, # 150% increase negated_coded_as_affirmed_pct=0.5, dp_accuracy_pct=70.0, # 17.6% decrease phantom_das_pct=5.0, ccam_without_evidence_pct=1.0, one_click_validation_pct=60.0 ) collector = MetricsCollector(db_session) drift_detected = collector.detect_drift(current, baseline, drift_threshold=10.0) assert drift_detected is True assert "codes_without_evidence_pct" in current.drift_metrics assert "dp_accuracy_pct" in current.drift_metrics def test_check_thresholds_no_alerts(db_session): """Test threshold checking with metrics within thresholds.""" metrics = QualityMetrics( codes_without_evidence_pct=2.0, negated_coded_as_affirmed_pct=0.5, dp_accuracy_pct=85.0, phantom_das_pct=5.0, ccam_without_evidence_pct=1.0, one_click_validation_pct=60.0, question_relevance_pct=85.0 ) collector = MetricsCollector(db_session) alerts = collector.check_thresholds(metrics) assert len(alerts) == 0 def test_check_thresholds_with_alerts(db_session): """ Test threshold checking with metrics exceeding thresholds. Validates: Requirement 18.9 """ metrics = QualityMetrics( codes_without_evidence_pct=10.0, # Exceeds 5% negated_coded_as_affirmed_pct=2.0, # Exceeds 1% dp_accuracy_pct=70.0, # Below 80% phantom_das_pct=15.0, # Exceeds 10% ccam_without_evidence_pct=5.0, # Exceeds 2% one_click_validation_pct=40.0, # Below 50% question_relevance_pct=70.0 # Below 80% ) collector = MetricsCollector(db_session) alerts = collector.check_thresholds(metrics) # Should have 7 alerts (one for each exceeded threshold) assert len(alerts) == 7 assert any("Codes without evidence" in alert for alert in alerts) assert any("Negated diagnoses" in alert for alert in alerts) assert any("DP accuracy" in alert for alert in alerts) assert any("Phantom DAS" in alert for alert in alerts) assert any("CCAM without evidence" in alert for alert in alerts) assert any("One-click validation" in alert for alert in alerts) assert any("Question relevance" in alert for alert in alerts) def test_custom_thresholds(db_session): """Test MetricsCollector with custom thresholds.""" custom_thresholds = MetricsThresholds( max_codes_without_evidence_pct=10.0, max_negated_coded_as_affirmed_pct=2.0, min_dp_accuracy_pct=70.0 ) collector = MetricsCollector(db_session, thresholds=custom_thresholds) assert collector.thresholds.max_codes_without_evidence_pct == 10.0 assert collector.thresholds.max_negated_coded_as_affirmed_pct == 2.0 assert collector.thresholds.min_dp_accuracy_pct == 70.0 def test_calculate_metrics_with_date_filter(db_session): """Test metrics calculation with date filtering.""" # Create stay from 10 days ago old_stay = create_stay_with_codes( db_session, stay_id=10, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [] } ] ) old_stay.created_at = datetime.now() - timedelta(days=10) db_session.commit() # Create stay from 2 days ago recent_stay = create_stay_with_codes( db_session, stay_id=11, codes_data=[ { "code": "E11.9", "code_type": "dp", "confidence_score": 0.8, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"} ] } ] ) recent_stay.created_at = datetime.now() - timedelta(days=2) db_session.commit() collector = MetricsCollector(db_session) # Calculate metrics for last 5 days only start_date = datetime.now() - timedelta(days=5) metrics = collector.calculate_metrics(start_date=start_date) # Should only include recent stay assert metrics.total_stays == 1 assert metrics.total_codes == 1 def test_calculate_metrics_empty_database(db_session): """Test metrics calculation with no stays.""" collector = MetricsCollector(db_session) metrics = collector.calculate_metrics() assert metrics.total_stays == 0 assert metrics.total_codes == 0 assert metrics.codes_without_evidence_pct == 0.0 def test_calculate_metrics_with_stay_ids(db_session): """Test metrics calculation with specific stay IDs.""" # Create multiple stays create_stay_with_codes( db_session, stay_id=12, codes_data=[ { "code": "K29.7", "code_type": "dp", "confidence_score": 0.9, "evidence": [] } ] ) create_stay_with_codes( db_session, stay_id=13, codes_data=[ { "code": "E11.9", "code_type": "dp", "confidence_score": 0.8, "evidence": [ {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"} ] } ] ) collector = MetricsCollector(db_session) # Calculate metrics for specific stay only metrics = collector.calculate_metrics(stay_ids=[12]) assert metrics.total_stays == 1 assert metrics.total_codes == 1 assert metrics.codes_without_evidence_pct == 100.0