"""
|
|
Tests for MetricsCollector.
|
|
|
|
Validates: Requirements 18.1-18.9
|
|
"""
|
|
|
|
import pytest

from datetime import datetime, timedelta

from sqlalchemy.orm import Session

from src.pipeline_mco_pmsi.metrics import MetricsCollector, QualityMetrics, MetricsThresholds
from pipeline_mco_pmsi.database.models import (
    StayDB,
    CodeDB,
    ClinicalFactDB,
    EvidenceDB,
    QuestionDB,
    TIMCorrectionDB,
)
|
|
|
|
|
|
def create_stay_with_codes(
    session: Session,
    stay_id: int,
    codes_data: list,
    facts_data: list = None
) -> StayDB:
    """Create and persist a stay with codes, their evidence, and optional facts.

    Args:
        session: Active SQLAlchemy session; committed before returning.
        stay_id: Primary key; also used to build the ``stay_<id>`` identifier.
        codes_data: List of CodeDB kwargs dicts. Each may carry an
            ``evidence`` list (EvidenceDB kwargs) plus the legacy aliases
            ``code_type``/``confidence_score``, which are remapped to
            ``type``/``confidence``. Missing required fields get test defaults.
        facts_data: Optional list of ClinicalFactDB kwargs dicts.

    Returns:
        The persisted StayDB instance.
    """
    stay = StayDB(
        id=stay_id,
        stay_id=f"stay_{stay_id}",
        admission_date=datetime.now() - timedelta(days=7),
        discharge_date=datetime.now() - timedelta(days=1),
        specialty="MCO",
        created_at=datetime.now()
    )
    session.add(stay)
    session.flush()

    # Add facts if provided
    if facts_data:
        # enumerate() gives each fact a unique index: the previous
        # len(stay.facts) relied on the relationship collection being
        # refreshed after each add, which is not guaranteed and could
        # produce duplicate fact_ids.
        for fact_index, fact_data in enumerate(facts_data):
            fact = ClinicalFactDB(
                stay_id=stay.id,
                fact_id=f"fact_{stay_id}_{fact_index}",
                evidence_document_id="doc_1",
                evidence_span_start=0,
                evidence_span_end=10,
                evidence_text=fact_data.get("text", ""),
                **fact_data
            )
            session.add(fact)

    # Add codes
    for code_data in codes_data:
        # Work on a shallow copy so pop()/setdefault() below do not
        # mutate the caller's dict literals.
        code_data = dict(code_data)
        evidence_data = code_data.pop("evidence", [])
        # Map code_type to type and confidence_score to confidence
        if "code_type" in code_data:
            code_data["type"] = code_data.pop("code_type")
        if "confidence_score" in code_data:
            code_data["confidence"] = code_data.pop("confidence_score")

        # Add required fields
        code_data.setdefault("label", f"Label for {code_data['code']}")
        code_data.setdefault("reasoning", "Test reasoning")
        code_data.setdefault("referentiel_version", "2026")
        code_data.setdefault("model_name", "test_model")
        code_data.setdefault("model_digest", "test_digest")
        code_data.setdefault("prompt_version", "1.0")

        code = CodeDB(
            stay_id=stay.id,
            **code_data
        )
        session.add(code)
        # Flush so code.id is populated before evidence rows reference it.
        session.flush()

        # Add evidence
        for ev_data in evidence_data:
            evidence = EvidenceDB(
                code_id=code.id,
                **ev_data
            )
            session.add(evidence)

    session.commit()
    return stay
|
|
|
|
|
|
def test_metrics_collector_initialization(db_session):
    """The collector keeps the session and builds default thresholds."""
    collector = MetricsCollector(db_session)

    assert collector.session == db_session
    assert isinstance(collector.thresholds, MetricsThresholds)
    assert collector.thresholds is not None
|
|
|
|
|
|
def test_calculate_codes_without_evidence(db_session):
    """
    Test calculation of codes without evidence percentage.

    Validates: Requirement 18.1
    """
    # One code backed by evidence, two with empty evidence lists.
    evidenced_dp = {
        "code": "K29.7",
        "code_type": "dp",
        "confidence_score": 0.9,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
        ],
    }
    bare_das = {
        "code": "E11.9",
        "code_type": "das",
        "confidence_score": 0.8,
        "evidence": [],  # No evidence
    }
    bare_ccam = {
        "code": "YYYY001",
        "code_type": "ccam",
        "confidence_score": 0.7,
        "evidence": [],  # No evidence
    }
    create_stay_with_codes(
        db_session,
        stay_id=1,
        codes_data=[evidenced_dp, bare_das, bare_ccam],
    )

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 2 out of 3 codes have no evidence = 66.67%
    assert metrics.total_codes == 3
    assert metrics.codes_without_evidence_pct == pytest.approx(66.67, rel=0.1)
|
|
|
|
|
|
def test_calculate_negated_coded_as_affirmed(db_session):
    """
    Test calculation of negated diagnoses coded as affirmed.

    Validates: Requirement 18.2
    """
    # The single fact is explicitly negated, yet a DP code is emitted anyway.
    negated_fact = {
        "type": "diagnostic",
        "text": "Gastrite",
        "qualifier_certainty": "nié",
        "qualifier_markers": [],
        "qualifier_confidence": 0.9,
        "temporality": "actuel",
        "confidence": 0.9,
    }
    dp_code = {
        "code": "K29.7",
        "code_type": "dp",
        "confidence_score": 0.9,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
        ],
    }
    create_stay_with_codes(
        db_session,
        stay_id=2,
        facts_data=[negated_fact],
        codes_data=[dp_code],
    )

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 1 out of 1 diagnostic code is negated = 100%
    assert metrics.negated_coded_as_affirmed_pct == 100.0
|
|
|
|
|
|
def test_calculate_dp_accuracy_with_gold_standard(db_session):
    """
    Test calculation of DP accuracy vs gold standard.

    Validates: Requirement 18.3
    """

    def make_dp(code, confidence, evidence_text):
        # Small factory for a single evidenced DP code payload.
        return {
            "code": code,
            "code_type": "dp",
            "confidence_score": confidence,
            "evidence": [
                {"document_id": 1, "span_start": 0, "span_end": 10, "text": evidence_text}
            ],
        }

    create_stay_with_codes(
        db_session, stay_id=3, codes_data=[make_dp("K29.7", 0.9, "Gastrite")]
    )
    create_stay_with_codes(
        db_session, stay_id=4, codes_data=[make_dp("E11.9", 0.8, "Diabète")]
    )

    gold_standard = {
        3: "K29.7",  # Correct
        4: "K29.7",  # Incorrect (proposed E11.9)
    }

    metrics = MetricsCollector(db_session).calculate_metrics(gold_standard=gold_standard)

    # 1 out of 2 correct = 50%
    assert metrics.dp_accuracy_pct == 50.0
|
|
|
|
|
|
def test_calculate_phantom_das(db_session):
    """
    Test calculation of phantom DAS percentage.

    Validates: Requirement 18.4
    """
    # One solid DAS plus two phantoms (low confidence, then no evidence).
    solid_das = {
        "code": "E11.9",
        "code_type": "das",
        "confidence_score": 0.9,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
        ],
    }
    low_confidence_das = {
        "code": "I10",
        "code_type": "das",
        "confidence_score": 0.3,  # Low confidence = phantom
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "HTA"}
        ],
    }
    unevidenced_das = {
        "code": "Z86.7",
        "code_type": "das",
        "confidence_score": 0.8,
        "evidence": [],  # No evidence = phantom
    }
    create_stay_with_codes(
        db_session,
        stay_id=5,
        codes_data=[solid_das, low_confidence_das, unevidenced_das],
    )

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 2 out of 3 DAS are phantom = 66.67%
    assert metrics.phantom_das_pct == pytest.approx(66.67, rel=0.1)
|
|
|
|
|
|
def test_calculate_ccam_without_evidence(db_session):
    """
    Test calculation of CCAM acts without evidence.

    Validates: Requirement 18.5
    """
    # One evidenced CCAM act and one with an empty evidence list.
    evidenced_act = {
        "code": "YYYY001",
        "code_type": "ccam",
        "confidence_score": 0.9,
        "evidence": [
            {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Intervention"}
        ],
    }
    bare_act = {
        "code": "ZZZZ002",
        "code_type": "ccam",
        "confidence_score": 0.8,
        "evidence": [],  # No evidence
    }
    create_stay_with_codes(db_session, stay_id=6, codes_data=[evidenced_act, bare_act])

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 1 out of 2 CCAM without evidence = 50%
    assert metrics.ccam_without_evidence_pct == 50.0
|
|
|
|
|
|
def test_calculate_one_click_validation(db_session):
    """
    Test calculation of one-click validation rate.

    Validates: Requirement 18.6
    """
    # Stay 7: accepted with no TIM correction -> counts as one-click.
    create_stay_with_codes(
        db_session,
        stay_id=7,
        codes_data=[{
            "code": "K29.7",
            "code_type": "dp",
            "confidence_score": 0.9,
            "status": "accepted",
            "evidence": [
                {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
            ],
        }],
    )
    db_session.commit()

    # Stay 8: accepted but later corrected by a TIM -> not one-click.
    corrected_stay = create_stay_with_codes(
        db_session,
        stay_id=8,
        codes_data=[{
            "code": "E11.9",
            "code_type": "dp",
            "confidence_score": 0.8,
            "status": "accepted",
            "evidence": [
                {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
            ],
        }],
    )
    db_session.commit()

    db_session.add(TIMCorrectionDB(
        original_code_id=corrected_stay.codes[0].id,
        corrected_code="E11.0",
        corrected_label="Diabète de type 1",
        corrected_type="dp",
        user_id="tim_001",
        timestamp=datetime.now(),
    ))
    db_session.commit()

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 1 out of 2 validated stays is one-click = 50%
    assert metrics.one_click_validation_pct == 50.0
|
|
|
|
|
|
def test_calculate_question_relevance(db_session):
    """
    Test calculation of question relevance percentage.

    Validates: Requirement 18.7
    """
    stay = create_stay_with_codes(
        db_session,
        stay_id=9,
        codes_data=[{
            "code": "K29.7",
            "code_type": "dp",
            "confidence_score": 0.9,
            "evidence": [
                {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Gastrite"}
            ],
        }],
    )

    # Two questions: one above and one below the 0.7 relevance cutoff.
    question_specs = [
        ("q1", "Date de début des symptômes?", 1, 0.9),  # Relevant
        ("q2", "Antécédents familiaux?", 2, 0.5),  # Not relevant
    ]
    for question_id, text, priority, relevance in question_specs:
        question = QuestionDB(
            stay_id=stay.id,
            question_id=question_id,
            text=text,
            priority=priority,
            category="clinical",
            context="Missing information",
            suggested_answers=[]
        )
        question.relevance_score = relevance
        db_session.add(question)

    db_session.commit()

    metrics = MetricsCollector(db_session).calculate_metrics()

    # 1 out of 2 questions is relevant (score >= 0.7) = 50%
    assert metrics.question_relevance_pct == 50.0
|
|
|
|
|
|
def test_detect_drift(db_session):
    """
    Test drift detection between baseline and current metrics.

    Validates: Requirement 18.8
    """
    # Fields that stay identical between baseline and current.
    stable_fields = dict(
        negated_coded_as_affirmed_pct=0.5,
        phantom_das_pct=5.0,
        ccam_without_evidence_pct=1.0,
        one_click_validation_pct=60.0,
    )
    baseline = QualityMetrics(
        codes_without_evidence_pct=2.0,
        dp_accuracy_pct=85.0,
        **stable_fields,
    )
    # Current metrics with significant drift
    current = QualityMetrics(
        codes_without_evidence_pct=5.0,  # 150% increase
        dp_accuracy_pct=70.0,  # 17.6% decrease
        **stable_fields,
    )

    collector = MetricsCollector(db_session)
    drift_detected = collector.detect_drift(current, baseline, drift_threshold=10.0)

    assert drift_detected is True
    assert "codes_without_evidence_pct" in current.drift_metrics
    assert "dp_accuracy_pct" in current.drift_metrics
|
|
|
|
|
|
def test_check_thresholds_no_alerts(db_session):
    """Test threshold checking with metrics within thresholds."""
    healthy_metrics = QualityMetrics(
        codes_without_evidence_pct=2.0,
        negated_coded_as_affirmed_pct=0.5,
        dp_accuracy_pct=85.0,
        phantom_das_pct=5.0,
        ccam_without_evidence_pct=1.0,
        one_click_validation_pct=60.0,
        question_relevance_pct=85.0
    )

    alerts = MetricsCollector(db_session).check_thresholds(healthy_metrics)

    # Every value is inside its default threshold -> nothing to report.
    assert not alerts
|
|
|
|
|
|
def test_check_thresholds_with_alerts(db_session):
    """
    Test threshold checking with metrics exceeding thresholds.

    Validates: Requirement 18.9
    """
    failing_metrics = QualityMetrics(
        codes_without_evidence_pct=10.0,  # Exceeds 5%
        negated_coded_as_affirmed_pct=2.0,  # Exceeds 1%
        dp_accuracy_pct=70.0,  # Below 80%
        phantom_das_pct=15.0,  # Exceeds 10%
        ccam_without_evidence_pct=5.0,  # Exceeds 2%
        one_click_validation_pct=40.0,  # Below 50%
        question_relevance_pct=70.0  # Below 80%
    )

    alerts = MetricsCollector(db_session).check_thresholds(failing_metrics)

    # Should have 7 alerts (one for each exceeded threshold)
    assert len(alerts) == 7
    expected_fragments = [
        "Codes without evidence",
        "Negated diagnoses",
        "DP accuracy",
        "Phantom DAS",
        "CCAM without evidence",
        "One-click validation",
        "Question relevance",
    ]
    for fragment in expected_fragments:
        assert any(fragment in alert for alert in alerts)
|
|
|
|
|
|
def test_custom_thresholds(db_session):
    """Test MetricsCollector with custom thresholds."""
    overrides = {
        "max_codes_without_evidence_pct": 10.0,
        "max_negated_coded_as_affirmed_pct": 2.0,
        "min_dp_accuracy_pct": 70.0,
    }
    collector = MetricsCollector(db_session, thresholds=MetricsThresholds(**overrides))

    # Every overridden threshold must come back unchanged.
    for attribute, expected in overrides.items():
        assert getattr(collector.thresholds, attribute) == expected
|
|
|
|
|
|
def test_calculate_metrics_with_date_filter(db_session):
    """Test metrics calculation with date filtering."""

    def make_aged_stay(stay_id, code, confidence, evidence, age_days):
        # Create a stay, then back-date its created_at timestamp.
        stay = create_stay_with_codes(
            db_session,
            stay_id=stay_id,
            codes_data=[{
                "code": code,
                "code_type": "dp",
                "confidence_score": confidence,
                "evidence": evidence,
            }],
        )
        stay.created_at = datetime.now() - timedelta(days=age_days)
        db_session.commit()
        return stay

    # One stay outside the window (10 days old), one inside it (2 days old).
    make_aged_stay(10, "K29.7", 0.9, [], age_days=10)
    make_aged_stay(
        11, "E11.9", 0.8,
        [{"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}],
        age_days=2,
    )

    # Calculate metrics for last 5 days only
    start_date = datetime.now() - timedelta(days=5)
    metrics = MetricsCollector(db_session).calculate_metrics(start_date=start_date)

    # Should only include recent stay
    assert metrics.total_stays == 1
    assert metrics.total_codes == 1
|
|
|
|
|
|
def test_calculate_metrics_empty_database(db_session):
    """Test metrics calculation with no stays."""
    metrics = MetricsCollector(db_session).calculate_metrics()

    # With nothing persisted, counts and percentages are all zero.
    assert metrics.total_stays == 0
    assert metrics.total_codes == 0
    assert metrics.codes_without_evidence_pct == 0.0
|
|
|
|
|
|
def test_calculate_metrics_with_stay_ids(db_session):
    """Test metrics calculation with specific stay IDs."""
    # Stay 12 carries a code without evidence; stay 13's code is evidenced.
    create_stay_with_codes(
        db_session,
        stay_id=12,
        codes_data=[{
            "code": "K29.7",
            "code_type": "dp",
            "confidence_score": 0.9,
            "evidence": [],
        }],
    )
    create_stay_with_codes(
        db_session,
        stay_id=13,
        codes_data=[{
            "code": "E11.9",
            "code_type": "dp",
            "confidence_score": 0.8,
            "evidence": [
                {"document_id": 1, "span_start": 0, "span_end": 10, "text": "Diabète"}
            ],
        }],
    )

    # Calculate metrics for specific stay only
    metrics = MetricsCollector(db_session).calculate_metrics(stay_ids=[12])

    assert metrics.total_stays == 1
    assert metrics.total_codes == 1
    assert metrics.codes_without_evidence_pct == 100.0
|