Initial commit

This commit is contained in:
Dom
2026-03-05 01:20:14 +01:00
commit 2163e574c1
184 changed files with 354881 additions and 0 deletions

View File

@@ -0,0 +1,620 @@
"""
Tests unitaires pour le GoldSetValidator.
Ces tests vérifient le chargement du jeu gold, l'exécution du pipeline,
le calcul des métriques et la validation des releases.
"""
import json
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import Mock, MagicMock
import pytest
from pipeline_mco_pmsi.validation import GoldSetValidator
from pipeline_mco_pmsi.validation.gold_set_validator import (
GoldSetMetrics,
GoldStayResult
)
from pipeline_mco_pmsi.models.coding import Code
@pytest.fixture
def temp_gold_dir():
    """Yield a temporary directory (as a Path) for the gold set; removed on teardown."""
    with tempfile.TemporaryDirectory() as scratch:
        yield Path(scratch)
@pytest.fixture
def sample_gold_set():
    """Build a synthetic gold set of 200 stays, one document each."""
    def _make_stay(idx):
        # Deterministic codes derived from the index so stays are distinct
        # but reproducible across runs.
        return {
            "stay_id": f"SEJ{idx:03d}",
            "documents": [
                {
                    "document_id": f"DOC{idx:03d}",
                    "content": f"Patient {idx} avec diagnostic test",
                    "document_type": "CRO",
                }
            ],
            "expected_codes": {
                "dp": f"I{idx%10:02d}.{idx%10}",
                "das": [f"E{idx%5:02d}.{idx%5}", f"K{idx%3:02d}.{idx%3}"],
                "ccam": [f"YYYY{idx%100:03d}"],
            },
        }

    return [_make_stay(i) for i in range(200)]
class TestGoldSetValidatorInit:
    """Construction of GoldSetValidator: default and custom thresholds."""

    def test_init_with_defaults(self, temp_gold_dir):
        """Omitting thresholds falls back to the documented defaults."""
        v = GoldSetValidator(gold_set_path=temp_gold_dir)
        assert v.gold_set_path == temp_gold_dir
        assert v.min_dp_accuracy == 0.70
        assert v.min_das_f1 == 0.60
        assert v.min_ccam_f1 == 0.65
        assert v.max_degradation == 0.05

    def test_init_with_custom_thresholds(self, temp_gold_dir):
        """Explicit thresholds are stored verbatim on the instance."""
        v = GoldSetValidator(
            gold_set_path=temp_gold_dir,
            min_dp_accuracy=0.80,
            min_das_f1=0.70,
            min_ccam_f1=0.75,
            max_degradation=0.03,
        )
        assert (v.min_dp_accuracy, v.min_das_f1, v.min_ccam_f1, v.max_degradation) == (
            0.80,
            0.70,
            0.75,
            0.03,
        )
class TestLoadGoldSet:
    """Loading the gold set file: success, missing file, undersized set."""

    def test_load_gold_set_success(self, temp_gold_dir, sample_gold_set):
        """A valid gold_set.json with 200 stays loads completely."""
        target = temp_gold_dir / "gold_set.json"
        target.write_text(json.dumps(sample_gold_set), encoding="utf-8")

        stays = GoldSetValidator(gold_set_path=temp_gold_dir).load_gold_set()

        assert len(stays) == 200
        assert stays[0]["stay_id"] == "SEJ000"
        assert "expected_codes" in stays[0]

    def test_load_gold_set_file_not_found(self, temp_gold_dir):
        """load_gold_set raises FileNotFoundError when the file is absent."""
        v = GoldSetValidator(gold_set_path=temp_gold_dir)
        with pytest.raises(FileNotFoundError, match="Fichier jeu gold introuvable"):
            v.load_gold_set()

    def test_load_gold_set_too_few_stays(self, temp_gold_dir):
        """A gold set below the 200-stay minimum is rejected with ValueError."""
        undersized = [
            {
                "stay_id": f"SEJ{i:03d}",
                "documents": [],
                "expected_codes": {"dp": "I21.0", "das": [], "ccam": []},
            }
            for i in range(50)
        ]
        (temp_gold_dir / "gold_set.json").write_text(
            json.dumps(undersized), encoding="utf-8"
        )
        v = GoldSetValidator(gold_set_path=temp_gold_dir)
        with pytest.raises(ValueError, match="au moins 200 séjours"):
            v.load_gold_set()
class TestRunGoldSet:
    """Running the pipeline over gold stays: nominal path and error capture."""

    def test_run_gold_set_success(self, temp_gold_dir):
        """Each stay is scored against its expected codes via the mock pipeline."""
        from pipeline_mco_pmsi.models.clinical import Evidence, Span

        validator = GoldSetValidator(gold_set_path=temp_gold_dir)

        def _code(value, kind, label, score):
            # All mock codes share the same single evidence span.
            return Code(
                code=value,
                type=kind,
                label=label,
                confidence=score,
                reasoning="Test",
                evidence=[
                    Evidence(
                        document_id="DOC001", span=Span(start=0, end=10), text="test"
                    )
                ],
                referentiel_version="2026",
            )

        mock_result = Mock()
        mock_result.proposed_codes = [
            _code("I21.0", "dp", "Infarctus", 0.9),
            _code("I10", "das", "HTA", 0.8),
            _code("YYYY001", "ccam", "Acte", 0.85),
        ]
        mock_pipeline = Mock()
        mock_pipeline.process_stay.return_value = mock_result

        def _stay(sid, expected):
            return {"stay_id": sid, "documents": [], "expected_codes": expected}

        # Minimal gold set: 3 stays with varying expected code lists.
        gold_stays = [
            _stay("SEJ001", {"dp": "I21.0", "das": ["I10"], "ccam": ["YYYY001"]}),
            _stay(
                "SEJ002",
                {"dp": "I21.0", "das": ["I10", "E11.9"], "ccam": ["YYYY001"]},
            ),
            _stay("SEJ003", {"dp": "I21.0", "das": [], "ccam": []}),
        ]

        results = validator.run_gold_set(mock_pipeline, gold_stays)

        assert len(results) == 3
        assert all(isinstance(r, GoldStayResult) for r in results)
        first = results[0]
        assert first.stay_id == "SEJ001"
        assert first.dp_correct is True
        assert first.dp_predicted == "I21.0"

    def test_run_gold_set_with_errors(self, temp_gold_dir):
        """A pipeline exception is recorded per stay instead of propagating."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        failing_pipeline = Mock()
        failing_pipeline.process_stay.side_effect = Exception("Erreur de traitement")

        stays = [
            {
                "stay_id": "SEJ001",
                "documents": [],
                "expected_codes": {"dp": "I21.0", "das": [], "ccam": []},
            }
        ]
        results = validator.run_gold_set(failing_pipeline, stays)

        assert len(results) == 1
        failed = results[0]
        assert failed.dp_correct is False
        assert len(failed.errors) > 0
        assert "Erreur de traitement" in failed.errors[0]
class TestCalculateMetrics:
    """Aggregation of per-stay results into GoldSetMetrics."""

    @staticmethod
    def _result(**overrides):
        """Build a fully-correct GoldStayResult; keyword overrides tweak fields."""
        fields = dict(
            stay_id="SEJ000",
            dp_correct=True,
            dp_predicted="I0.0",
            dp_expected="I0.0",
            das_predicted=["I10", "E11.9"],
            das_expected=["I10", "E11.9"],
            das_precision=1.0,
            das_recall=1.0,
            das_f1=1.0,
            ccam_predicted=["YYYY001"],
            ccam_expected=["YYYY001"],
            ccam_precision=1.0,
            ccam_recall=1.0,
            ccam_f1=1.0,
            processing_time_seconds=1.5,
            errors=[],
        )
        fields.update(overrides)
        return GoldStayResult(**fields)

    def test_calculate_metrics_perfect_score(self, temp_gold_dir):
        """All-correct results yield 1.0 across every metric and no errors."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        results = [
            self._result(
                stay_id=f"SEJ{i:03d}",
                dp_predicted=f"I{i}.0",
                dp_expected=f"I{i}.0",
            )
            for i in range(10)
        ]

        metrics = validator.calculate_metrics(results)

        assert metrics.total_stays == 10
        assert metrics.dp_accuracy == 1.0
        assert metrics.das_f1 == 1.0
        assert metrics.ccam_f1 == 1.0
        assert metrics.error_rate == 0.0
        assert metrics.avg_processing_time == 1.5

    def test_calculate_metrics_partial_score(self, temp_gold_dir):
        """Mixed results average out: 1 of 2 DP correct, partial F1 scores."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        results = [
            # Correct DP, missed one DAS code.
            self._result(
                stay_id="SEJ001",
                dp_predicted="I21.0",
                dp_expected="I21.0",
                das_predicted=["I10"],
                das_expected=["I10", "E11.9"],
                das_precision=1.0,
                das_recall=0.5,
                das_f1=0.67,
                processing_time_seconds=2.0,
            ),
            # Wrong DP, spurious DAS code, no CCAM predicted.
            self._result(
                stay_id="SEJ002",
                dp_correct=False,
                dp_predicted="I22.0",
                dp_expected="I21.0",
                das_predicted=["I10", "E11.9"],
                das_expected=["I10"],
                das_precision=0.5,
                das_recall=1.0,
                das_f1=0.67,
                ccam_predicted=[],
                ccam_expected=["YYYY001"],
                ccam_precision=0.0,
                ccam_recall=0.0,
                ccam_f1=0.0,
                processing_time_seconds=1.5,
            ),
        ]

        metrics = validator.calculate_metrics(results)

        assert metrics.total_stays == 2
        assert metrics.dp_accuracy == 0.5  # 1 correct out of 2
        assert 0.6 < metrics.das_f1 < 0.7  # mean of 0.67 and 0.67
        assert metrics.ccam_f1 == 0.5  # mean of 1.0 and 0.0
        assert metrics.avg_processing_time == 1.75  # mean of 2.0 and 1.5
class TestCompareMetrics:
    """Pairwise diffing of before/after metric snapshots."""

    @staticmethod
    def _metrics(dp, das_p, das_r, das_f, ccam_p, ccam_r, ccam_f, avg_time, err):
        """Assemble a GoldSetMetrics for a fixed 200-stay run."""
        return GoldSetMetrics(
            total_stays=200,
            dp_accuracy=dp,
            das_precision=das_p,
            das_recall=das_r,
            das_f1=das_f,
            ccam_precision=ccam_p,
            ccam_recall=ccam_r,
            ccam_f1=ccam_f,
            avg_processing_time=avg_time,
            error_rate=err,
        )

    def test_compare_metrics_improvement(self, temp_gold_dir):
        """Positive deltas for accuracy/F1, negative delta for error rate."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        before = self._metrics(0.70, 0.65, 0.60, 0.62, 0.70, 0.68, 0.69, 25.0, 0.05)
        after = self._metrics(0.75, 0.70, 0.65, 0.67, 0.75, 0.73, 0.74, 23.0, 0.03)

        diffs = validator.compare_metrics(before, after)

        assert diffs["dp_accuracy"] == pytest.approx(0.05)  # improved
        assert diffs["das_f1"] == pytest.approx(0.05)  # improved
        assert diffs["ccam_f1"] == pytest.approx(0.05)  # improved
        assert diffs["error_rate"] == pytest.approx(-0.02)  # improved (fewer errors)

    def test_compare_metrics_degradation(self, temp_gold_dir):
        """Negative deltas when the after-run scores drop."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        before = self._metrics(0.75, 0.70, 0.65, 0.67, 0.75, 0.73, 0.74, 23.0, 0.03)
        after = self._metrics(0.68, 0.63, 0.58, 0.60, 0.68, 0.66, 0.67, 25.0, 0.06)

        diffs = validator.compare_metrics(before, after)

        assert diffs["dp_accuracy"] == pytest.approx(-0.07)  # degraded
        assert diffs["das_f1"] == pytest.approx(-0.07)  # degraded
        assert diffs["ccam_f1"] == pytest.approx(-0.07)  # degraded
class TestValidateRelease:
    """Go/no-go release decisions from before/after metrics."""

    @staticmethod
    def _metrics(dp, das_p, das_r, das_f, ccam_p, ccam_r, ccam_f, avg_time, err):
        """Assemble a GoldSetMetrics for a fixed 200-stay run."""
        return GoldSetMetrics(
            total_stays=200,
            dp_accuracy=dp,
            das_precision=das_p,
            das_recall=das_r,
            das_f1=das_f,
            ccam_precision=ccam_p,
            ccam_recall=ccam_r,
            ccam_f1=ccam_f,
            avg_processing_time=avg_time,
            error_rate=err,
        )

    def test_validate_release_success(self, temp_gold_dir):
        """Metrics above thresholds with no degradation pass with no reasons."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        before = self._metrics(0.72, 0.65, 0.60, 0.62, 0.70, 0.68, 0.69, 25.0, 0.05)
        after = self._metrics(0.75, 0.68, 0.63, 0.65, 0.73, 0.71, 0.72, 23.0, 0.03)

        release_ok, reasons = validator.validate_release(before, after)

        assert release_ok is True
        assert reasons == []

    def test_validate_release_below_threshold(self, temp_gold_dir):
        """Sub-threshold metrics block the release with one reason per failure."""
        validator = GoldSetValidator(
            gold_set_path=temp_gold_dir,
            min_dp_accuracy=0.70,
            min_das_f1=0.60,
            min_ccam_f1=0.65,
        )
        before = self._metrics(0.72, 0.65, 0.60, 0.62, 0.70, 0.68, 0.69, 25.0, 0.05)
        # Every headline metric falls below its configured minimum:
        # dp 0.65 < 0.70, das_f1 0.57 < 0.60, ccam_f1 0.62 < 0.65.
        after = self._metrics(0.65, 0.60, 0.55, 0.57, 0.63, 0.61, 0.62, 23.0, 0.03)

        release_ok, reasons = validator.validate_release(before, after)

        assert release_ok is False
        # 6 reasons expected: 3 threshold violations + 3 degradations.
        assert len(reasons) == 6
        assert any("DP accuracy" in r and "seuil minimum" in r for r in reasons)
        assert any("DAS F1" in r and "seuil minimum" in r for r in reasons)
        assert any("CCAM F1" in r and "seuil minimum" in r for r in reasons)
        assert any("Dégradation DP" in r for r in reasons)
        assert any("Dégradation DAS" in r for r in reasons)
        assert any("Dégradation CCAM" in r for r in reasons)

    def test_validate_release_excessive_degradation(self, temp_gold_dir):
        """A drop beyond max_degradation blocks the release."""
        validator = GoldSetValidator(
            gold_set_path=temp_gold_dir,
            max_degradation=0.05,
        )
        before = self._metrics(0.75, 0.70, 0.65, 0.67, 0.75, 0.73, 0.74, 23.0, 0.03)
        # 8-point drop on dp/das_f1/ccam_f1 — above the 5% tolerance.
        after = self._metrics(0.67, 0.62, 0.57, 0.59, 0.67, 0.65, 0.66, 25.0, 0.06)

        release_ok, reasons = validator.validate_release(before, after)

        assert release_ok is False
        assert len(reasons) >= 3
        assert any("Dégradation DP" in r for r in reasons)
        assert any("Dégradation DAS" in r for r in reasons)
        assert any("Dégradation CCAM" in r for r in reasons)
class TestSaveMetrics:
    """Persistence of computed metrics to JSON."""

    def test_save_metrics(self, temp_gold_dir):
        """save_metrics creates parent dirs and writes all metric fields."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        metrics = GoldSetMetrics(
            total_stays=200,
            dp_accuracy=0.75,
            das_precision=0.70,
            das_recall=0.65,
            das_f1=0.67,
            ccam_precision=0.75,
            ccam_recall=0.73,
            ccam_f1=0.74,
            avg_processing_time=23.0,
            error_rate=0.03,
        )
        destination = temp_gold_dir / "metrics" / "test_metrics.json"

        validator.save_metrics(metrics, destination)

        assert destination.exists()
        # Round-trip the file to check the persisted values.
        with open(destination, "r", encoding="utf-8") as handle:
            persisted = json.load(handle)
        assert persisted["total_stays"] == 200
        assert persisted["dp_accuracy"] == 0.75
        assert persisted["das_f1"] == 0.67
class TestCalculateMetricsHelper:
    """Tests for the _calculate_metrics precision/recall/F1 helper."""

    def test_calculate_metrics_perfect_match(self, temp_gold_dir):
        """Identical code lists score 1.0 on all three measures."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        codes = ["I10", "E11.9", "K29.7"]
        precision, recall, f1 = validator._calculate_metrics(codes, list(codes))
        assert (precision, recall, f1) == (1.0, 1.0, 1.0)

    def test_calculate_metrics_partial_match(self, temp_gold_dir):
        """Missing one expected code: full precision, 2/3 recall, F1 ≈ 0.80."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        precision, recall, f1 = validator._calculate_metrics(
            ["I10", "E11.9"],
            ["I10", "E11.9", "K29.7"],
        )
        assert precision == 1.0  # 2 correct of 2 predicted
        assert recall == 2 / 3  # 2 found of 3 expected
        assert 0.79 < f1 < 0.81  # 2 * (1.0 * 0.67) / (1.0 + 0.67) ≈ 0.80

    def test_calculate_metrics_no_match(self, temp_gold_dir):
        """Disjoint code lists score zero everywhere."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)
        precision, recall, f1 = validator._calculate_metrics(
            ["I10", "E11.9"],
            ["K29.7", "J44.0"],
        )
        assert (precision, recall, f1) == (0.0, 0.0, 0.0)

    def test_calculate_metrics_empty_lists(self, temp_gold_dir):
        """Empty-list edge cases: both empty is perfect, one-sided is zero."""
        validator = GoldSetValidator(gold_set_path=temp_gold_dir)

        # Both empty counts as a perfect match.
        assert validator._calculate_metrics([], []) == (1.0, 1.0, 1.0)

        # Nothing predicted while codes were expected.
        assert validator._calculate_metrics([], ["I10"]) == (0.0, 0.0, 0.0)

        # Codes predicted while none were expected.
        assert validator._calculate_metrics(["I10"], []) == (0.0, 0.0, 0.0)