# Source: aivanov_CIM/tests/test_metrics_collector.py
# Snapshot: 2026-03-05 01:20:14 +01:00 (616 lines, 18 KiB, Python)

"""
Tests for MetricsCollector.
Validates: Requirements 18.1-18.9
"""
import pytest
from datetime import datetime, timedelta
from sqlalchemy.orm import Session
from src.pipeline_mco_pmsi.metrics import MetricsCollector, QualityMetrics, MetricsThresholds
from pipeline_mco_pmsi.database.models import (
StayDB,
CodeDB,
ClinicalFactDB,
EvidenceDB,
QuestionDB,
TIMCorrectionDB,
)
def create_stay_with_codes(
    session: Session,
    stay_id: int,
    codes_data: list,
    facts_data: list = None
) -> StayDB:
    """Persist a stay together with its clinical facts, codes and evidence.

    Args:
        session: SQLAlchemy session used to add, flush and commit the rows.
        stay_id: Used as the stay's ``id`` and to derive its ``stay_id``
            string (``"stay_<stay_id>"``) and the generated fact ids.
        codes_data: One dict per code. The optional ``"evidence"`` entry is a
            list of keyword dicts for ``EvidenceDB``; ``"code_type"`` and
            ``"confidence_score"`` are renamed to ``"type"`` and
            ``"confidence"``; every remaining entry is passed to ``CodeDB``.
            Each dict is copied first, so the caller's data is not modified.
        facts_data: Optional list of keyword dicts for ``ClinicalFactDB``.
            ``None`` or an empty list creates no facts.

    Returns:
        The ``StayDB`` row, after the session has been committed.

    Raises:
        KeyError: If a code dict has no ``"code"`` entry (needed to build the
            default label).
    """
    # Take the timestamp once so all three dates derive from the same instant.
    now = datetime.now()
    stay = StayDB(
        id=stay_id,
        stay_id=f"stay_{stay_id}",
        admission_date=now - timedelta(days=7),
        discharge_date=now - timedelta(days=1),
        specialty="MCO",
        created_at=now
    )
    session.add(stay)
    session.flush()

    # Number facts with enumerate so every fact of the stay gets a distinct
    # id, regardless of whether the stay's fact collection has been reloaded.
    for fact_index, fact_data in enumerate(facts_data or []):
        fact = ClinicalFactDB(
            stay_id=stay.id,
            fact_id=f"fact_{stay_id}_{fact_index}",
            evidence_document_id="doc_1",
            evidence_span_start=0,
            evidence_span_end=10,
            evidence_text=fact_data.get("text", ""),
            **fact_data
        )
        session.add(fact)

    for raw_code in codes_data:
        # Work on a shallow copy: the key renames and defaults below must not
        # leak back into the dicts owned by the calling test.
        code_data = dict(raw_code)
        evidence_data = code_data.pop("evidence", [])

        # Map code_type to type and confidence_score to confidence
        if "code_type" in code_data:
            code_data["type"] = code_data.pop("code_type")
        if "confidence_score" in code_data:
            code_data["confidence"] = code_data.pop("confidence_score")

        # Fill in the remaining fields the tests do not care about.
        code_data.setdefault("label", f"Label for {code_data['code']}")
        code_data.setdefault("reasoning", "Test reasoning")
        code_data.setdefault("referentiel_version", "2026")
        code_data.setdefault("model_name", "test_model")
        code_data.setdefault("model_digest", "test_digest")
        code_data.setdefault("prompt_version", "1.0")

        code = CodeDB(
            stay_id=stay.id,
            **code_data
        )
        session.add(code)
        # Flush so code.id is populated before evidence rows reference it.
        session.flush()

        for ev_data in evidence_data:
            evidence = EvidenceDB(
                code_id=code.id,
                **ev_data
            )
            session.add(evidence)

    session.commit()
    return stay
def test_metrics_collector_initialization(db_session):
    """A new collector exposes its session and a MetricsThresholds object."""
    instance = MetricsCollector(db_session)

    # The session handed to the constructor is the one exposed afterwards.
    assert instance.session == db_session

    # No thresholds were supplied, yet a MetricsThresholds must be present.
    assert instance.thresholds is not None
    assert isinstance(instance.thresholds, MetricsThresholds)
def test_calculate_codes_without_evidence(db_session):
    """
    Test calculation of codes without evidence percentage.
    Validates: Requirement 18.1
    """
    # Create stay with codes - some with evidence, some without
    create_stay_with_codes(
        db_session,
        stay_id=1,
        codes_data=[
            {
                "code": "K29.7",
                "code_type": "dp",
                "confidence_score": 0.9,
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
                ]
            },
            {
                "code": "E11.9",
                "code_type": "das",
                "confidence_score": 0.8,
                "evidence": []  # No evidence
            },
            {
                "code": "YYYY001",
                "code_type": "ccam",
                "confidence_score": 0.7,
                "evidence": []  # No evidence
            }
        ]
    )

    collector = MetricsCollector(db_session)
    metrics = collector.calculate_metrics()

    # 2 out of 3 codes have no evidence = 66.67%.
    # Compare against the exact ratio with a small absolute tolerance: the
    # previous rel=0.1 allowed a 10% relative error (roughly 60-73%), while
    # abs=0.05 still accepts a result rounded to one or two decimals.
    assert metrics.codes_without_evidence_pct == pytest.approx(200 / 3, abs=0.05)
    assert metrics.total_codes == 3
def test_calculate_negated_coded_as_affirmed(db_session):
    """
    A diagnosis recorded as negated but still coded counts as an error.
    Validates: Requirement 18.2
    """
    # The clinical fact says the gastritis is negated ("nié") ...
    negated_fact = {
        "type": "diagnostic",
        "text": "Gastrite",
        "qualifier_certainty": "nié",
        "qualifier_markers": [],
        "qualifier_confidence": 0.9,
        "temporality": "actuel",
        "confidence": 0.9,
    }
    # ... yet the same text is used as evidence for a DP code.
    gastritis_span = {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
    affirmed_code = {
        "code": "K29.7",
        "code_type": "dp",
        "confidence_score": 0.9,
        "evidence": [gastritis_span],
    }

    create_stay_with_codes(
        db_session,
        stay_id=2,
        codes_data=[affirmed_code],
        facts_data=[negated_fact],
    )

    result = MetricsCollector(db_session).calculate_metrics()

    # The only diagnostic code is backed by a negated fact: 1 of 1 = 100%.
    assert result.negated_coded_as_affirmed_pct == 100.0
def test_calculate_dp_accuracy_with_gold_standard(db_session):
    """
    Proposed DP codes are scored against a gold-standard mapping.
    Validates: Requirement 18.3
    """
    # (stay id, proposed DP, confidence, evidence text) for each stay.
    proposals = (
        (3, "K29.7", 0.9, "Gastrite"),
        (4, "E11.9", 0.8, "Diabète"),
    )
    for stay_pk, proposed_dp, score, snippet in proposals:
        create_stay_with_codes(
            db_session,
            stay_id=stay_pk,
            codes_data=[
                {
                    "code": proposed_dp,
                    "code_type": "dp",
                    "confidence_score": score,
                    "evidence": [
                        {"document_id": 1, "span_start": 0, "span_end": 10, "text": snippet}
                    ],
                }
            ],
        )

    # Stay 3 matches its proposal; stay 4 expects K29.7 but E11.9 was proposed.
    expected_dp_by_stay = {3: "K29.7", 4: "K29.7"}

    result = MetricsCollector(db_session).calculate_metrics(
        gold_standard=expected_dp_by_stay
    )

    # One correct DP out of two stays = 50%.
    assert result.dp_accuracy_pct == 50.0
def test_calculate_phantom_das(db_session):
    """
    Test calculation of phantom DAS percentage.
    Validates: Requirement 18.4
    """
    # Create stay with DAS - some phantom (no/weak evidence)
    create_stay_with_codes(
        db_session,
        stay_id=5,
        codes_data=[
            {
                "code": "E11.9",
                "code_type": "das",
                "confidence_score": 0.9,
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
                ]
            },
            {
                "code": "I10",
                "code_type": "das",
                "confidence_score": 0.3,  # Low confidence = phantom
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "HTA"}
                ]
            },
            {
                "code": "Z86.7",
                "code_type": "das",
                "confidence_score": 0.8,
                "evidence": []  # No evidence = phantom
            }
        ]
    )

    collector = MetricsCollector(db_session)
    metrics = collector.calculate_metrics()

    # 2 out of 3 DAS are phantom = 66.67%.
    # Compare against the exact ratio with a small absolute tolerance: the
    # previous rel=0.1 allowed a 10% relative error (roughly 60-73%), while
    # abs=0.05 still accepts a result rounded to one or two decimals.
    assert metrics.phantom_das_pct == pytest.approx(200 / 3, abs=0.05)
def test_calculate_ccam_without_evidence(db_session):
    """
    CCAM acts lacking any evidence span are reported as a percentage.
    Validates: Requirement 18.5
    """
    supported_act = {
        "code": "YYYY001",
        "code_type": "ccam",
        "confidence_score": 0.9,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Intervention"}
        ],
    }
    unsupported_act = {
        "code": "ZZZZ002",
        "code_type": "ccam",
        "confidence_score": 0.8,
        "evidence": [],  # nothing backs this act
    }
    create_stay_with_codes(
        db_session,
        stay_id=6,
        codes_data=[supported_act, unsupported_act],
    )

    result = MetricsCollector(db_session).calculate_metrics()

    # One of the two CCAM acts has no evidence = 50%.
    assert result.ccam_without_evidence_pct == 50.0
def test_calculate_one_click_validation(db_session):
    """
    Test calculation of one-click validation rate.
    Validates: Requirement 18.6
    """
    # Validated stay without corrections (one-click). The returned stay is
    # not needed, and create_stay_with_codes already commits, so there is no
    # extra commit here.
    create_stay_with_codes(
        db_session,
        stay_id=7,
        codes_data=[
            {
                "code": "K29.7",
                "code_type": "dp",
                "confidence_score": 0.9,
                "status": "accepted",
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
                ]
            }
        ]
    )

    # Validated stay that will receive a correction (not one-click).
    corrected_stay = create_stay_with_codes(
        db_session,
        stay_id=8,
        codes_data=[
            {
                "code": "E11.9",
                "code_type": "dp",
                "confidence_score": 0.8,
                "status": "accepted",
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
                ]
            }
        ]
    )

    # Attach a TIM correction to the only code of the second stay.
    correction = TIMCorrectionDB(
        original_code_id=corrected_stay.codes[0].id,
        corrected_code="E11.0",
        corrected_label="Diabète de type 1",
        corrected_type="dp",
        user_id="tim_001",
        timestamp=datetime.now()
    )
    db_session.add(correction)
    db_session.commit()

    collector = MetricsCollector(db_session)
    metrics = collector.calculate_metrics()

    # 1 out of 2 validated stays is one-click = 50%
    assert metrics.one_click_validation_pct == 50.0
def test_calculate_question_relevance(db_session):
    """
    Questions carrying a relevance score are summarised as a percentage.
    Validates: Requirement 18.7
    """
    parent_stay = create_stay_with_codes(
        db_session,
        stay_id=9,
        codes_data=[
            {
                "code": "K29.7",
                "code_type": "dp",
                "confidence_score": 0.9,
                "evidence": [
                    {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
                ],
            }
        ],
    )

    # (question id, text, priority, relevance score) - the first is rated
    # relevant, the second is not.
    question_specs = (
        ("q1", "Date de début des symptômes?", 1, 0.9),
        ("q2", "Antécédents familiaux?", 2, 0.5),
    )
    for question_key, wording, rank, score in question_specs:
        question = QuestionDB(
            stay_id=parent_stay.id,
            question_id=question_key,
            text=wording,
            priority=rank,
            category="clinical",
            context="Missing information",
            suggested_answers=[],
        )
        # The score is assigned after construction, as attribute assignment.
        question.relevance_score = score
        db_session.add(question)
    db_session.commit()

    result = MetricsCollector(db_session).calculate_metrics()

    # 1 out of 2 questions is relevant (score >= 0.7) = 50%
    assert result.question_relevance_pct == 50.0
def test_detect_drift(db_session):
    """
    Drift is flagged when current metrics move away from the baseline.
    Validates: Requirement 18.8
    """
    reference_values = {
        "codes_without_evidence_pct": 2.0,
        "negated_coded_as_affirmed_pct": 0.5,
        "dp_accuracy_pct": 85.0,
        "phantom_das_pct": 5.0,
        "ccam_without_evidence_pct": 1.0,
        "one_click_validation_pct": 60.0,
    }
    # Only these two metrics differ from the baseline.
    shifted_values = {
        "codes_without_evidence_pct": 5.0,  # 150% increase
        "dp_accuracy_pct": 70.0,  # 17.6% decrease
    }

    baseline = QualityMetrics(**reference_values)
    current = QualityMetrics(**{**reference_values, **shifted_values})

    detector = MetricsCollector(db_session)
    drift_detected = detector.detect_drift(current, baseline, drift_threshold=10.0)

    assert drift_detected is True
    # Both shifted metrics must be recorded on the current metrics object.
    for metric_name in shifted_values:
        assert metric_name in current.drift_metrics
def test_check_thresholds_no_alerts(db_session):
    """Metrics expected to satisfy the default thresholds yield no alerts."""
    healthy_values = {
        "codes_without_evidence_pct": 2.0,
        "negated_coded_as_affirmed_pct": 0.5,
        "dp_accuracy_pct": 85.0,
        "phantom_das_pct": 5.0,
        "ccam_without_evidence_pct": 1.0,
        "one_click_validation_pct": 60.0,
        "question_relevance_pct": 85.0,
    }
    healthy_metrics = QualityMetrics(**healthy_values)

    raised = MetricsCollector(db_session).check_thresholds(healthy_metrics)

    assert len(raised) == 0
def test_check_thresholds_with_alerts(db_session):
    """
    Every metric outside its threshold produces one alert message.
    Validates: Requirement 18.9
    """
    failing_metrics = QualityMetrics(
        codes_without_evidence_pct=10.0,  # Exceeds 5%
        negated_coded_as_affirmed_pct=2.0,  # Exceeds 1%
        dp_accuracy_pct=70.0,  # Below 80%
        phantom_das_pct=15.0,  # Exceeds 10%
        ccam_without_evidence_pct=5.0,  # Exceeds 2%
        one_click_validation_pct=40.0,  # Below 50%
        question_relevance_pct=70.0,  # Below 80%
    )
    # Text fragment expected in the alert raised for each metric.
    expected_fragments = (
        "Codes without evidence",
        "Negated diagnoses",
        "DP accuracy",
        "Phantom DAS",
        "CCAM without evidence",
        "One-click validation",
        "Question relevance",
    )

    alerts = MetricsCollector(db_session).check_thresholds(failing_metrics)

    # Should have 7 alerts (one for each exceeded threshold)
    assert len(alerts) == 7
    for fragment in expected_fragments:
        assert any(fragment in alert for alert in alerts)
def test_custom_thresholds(db_session):
    """Thresholds passed to the constructor are the ones the collector keeps."""
    overrides = {
        "max_codes_without_evidence_pct": 10.0,
        "max_negated_coded_as_affirmed_pct": 2.0,
        "min_dp_accuracy_pct": 70.0,
    }
    collector = MetricsCollector(
        db_session,
        thresholds=MetricsThresholds(**overrides),
    )

    # Each overridden field must read back with the value supplied above.
    for field_name, expected_value in overrides.items():
        assert getattr(collector.thresholds, field_name) == expected_value
def test_calculate_metrics_with_date_filter(db_session):
    """Only stays created on or after start_date are expected in the metrics."""
    # (stay id, DP code, confidence, evidence spans, age in days)
    stay_specs = (
        (10, "K29.7", 0.9, [], 10),
        (
            11,
            "E11.9",
            0.8,
            [{"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}],
            2,
        ),
    )
    for stay_pk, dp_code, score, spans, age_days in stay_specs:
        stored_stay = create_stay_with_codes(
            db_session,
            stay_id=stay_pk,
            codes_data=[
                {
                    "code": dp_code,
                    "code_type": "dp",
                    "confidence_score": score,
                    "evidence": spans,
                }
            ],
        )
        # Backdate the stay, then persist the new creation timestamp.
        stored_stay.created_at = datetime.now() - timedelta(days=age_days)
        db_session.commit()

    collector = MetricsCollector(db_session)

    # Restrict the calculation to the last 5 days.
    window_start = datetime.now() - timedelta(days=5)
    result = collector.calculate_metrics(start_date=window_start)

    # Only the 2-day-old stay and its single code should be counted.
    assert result.total_stays == 1
    assert result.total_codes == 1
def test_calculate_metrics_empty_database(db_session):
    """With no stays stored, the counts and evidence percentage are zero."""
    empty_result = MetricsCollector(db_session).calculate_metrics()

    # Nothing was inserted, so both counters stay at zero ...
    assert empty_result.total_stays == 0
    assert empty_result.total_codes == 0
    # ... and the percentage is reported as 0.0.
    assert empty_result.codes_without_evidence_pct == 0.0
def test_calculate_metrics_with_stay_ids(db_session):
    """Passing stay_ids restricts the metrics to the listed stays."""
    # Stay 12: a single DP code with no evidence at all.
    code_without_evidence = {
        "code": "K29.7",
        "code_type": "dp",
        "confidence_score": 0.9,
        "evidence": [],
    }
    create_stay_with_codes(db_session, stay_id=12, codes_data=[code_without_evidence])

    # Stay 13: a single DP code backed by one evidence span.
    code_with_evidence = {
        "code": "E11.9",
        "code_type": "dp",
        "confidence_score": 0.8,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
        ],
    }
    create_stay_with_codes(db_session, stay_id=13, codes_data=[code_with_evidence])

    # Ask for stay 12 only.
    result = MetricsCollector(db_session).calculate_metrics(stay_ids=[12])

    assert result.total_stays == 1
    assert result.total_codes == 1
    # The only selected code has no evidence.
    assert result.codes_without_evidence_pct == 100.0