Files
t2a_v2/tests/test_rag.py
dom 4d6fbef2b9 feat: ajout RAG CIM-10 avec FAISS + Ollama
Implémente un système RAG (Retrieval Augmented Generation) qui indexe
les documents de référence ATIH (CIM-10 FR 2026, Guide Métho MCO,
CCAM PMSI) et utilise Ollama (mistral-small3.2:24b) pour justifier
et valider le codage CIM-10 des diagnostics.

- Nouveaux modèles Pydantic : RAGSource, Diagnostic étendu (confidence,
  justification, sources_rag) — rétrocompatible
- Module rag_index.py : chunking des 3 PDFs, embedding sentence-camembert-large,
  index FAISS IndexFlatIP (3630 vecteurs)
- Module rag_search.py : recherche FAISS + appel Ollama avec fallback double
- Flag CLI --no-rag pour désactiver l'enrichissement RAG
- 18 nouveaux tests (88/88 passent)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 17:47:08 +01:00

272 lines
9.9 KiB
Python

"""Tests pour le RAG CIM-10 (modèles, chunking, intégration)."""
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
from src.config import RAGSource, Diagnostic, DossierMedical, CIM10_PDF, GUIDE_METHODO_PDF, CCAM_PDF
class TestRAGSource:
    """Construction and serialization checks for the RAGSource model."""

    def test_create_minimal(self):
        """Only ``document`` is given; every other field must be None."""
        source = RAGSource(document="cim10")
        assert source.document == "cim10"
        for field_name in ("page", "code", "extrait"):
            assert getattr(source, field_name) is None

    def test_create_full(self):
        """All four fields are given and read back unchanged."""
        field_values = {
            "document": "guide_methodo",
            "page": 42,
            "code": "K85",
            "extrait": "Pancréatite aiguë biliaire...",
        }
        source = RAGSource(**field_values)
        for field_name, expected in field_values.items():
            assert getattr(source, field_name) == expected

    def test_serialization(self):
        """model_dump(exclude_none=True) drops the unset ``extrait`` field."""
        source = RAGSource(document="ccam", page=1, code="HMFC004")
        dumped = source.model_dump(exclude_none=True)
        expected = {"document": "ccam", "page": 1, "code": "HMFC004"}
        assert dumped == expected
class TestDiagnosticExtended:
    """Checks for the RAG-related fields added to the Diagnostic model."""

    def test_backward_compatible(self):
        """The new fields are optional, so the old constructor call still works."""
        diag = Diagnostic(texte="Pancréatite aiguë", cim10_suggestion="K85.9")
        assert diag.texte == "Pancréatite aiguë"
        assert diag.cim10_suggestion == "K85.9"
        # RAG fields fall back to their defaults when not supplied.
        assert diag.cim10_confidence is None
        assert diag.justification is None
        assert diag.sources_rag == []

    def test_with_rag_fields(self):
        """A Diagnostic built with every RAG field exposes them as given."""
        k80_source = RAGSource(document="cim10", page=480, code="K80")
        diag = Diagnostic(
            texte="Lithiase cholédoque",
            cim10_suggestion="K80.5",
            cim10_confidence="high",
            justification="Code K80.5 correspond à la lithiase du cholédoque",
            sources_rag=[k80_source],
        )
        assert diag.cim10_confidence == "high"
        assert diag.justification is not None
        attached = diag.sources_rag
        assert len(attached) == 1
        assert attached[0].code == "K80"

    def test_serialization_exclude_none(self):
        """Check that the dumped payload does not include None-valued fields."""
        diag = Diagnostic(texte="Test", cim10_suggestion="K85.9")
        payload = diag.model_dump(exclude_none=True)
        for absent_key in ("cim10_confidence", "justification"):
            assert absent_key not in payload
        # The empty list is not None, so it is still included.
        assert "sources_rag" in payload

    def test_dossier_with_extended_diagnostic(self):
        """A DossierMedical can carry a RAG-enriched principal diagnostic."""
        rag_sources = [
            RAGSource(document="cim10", page=496, code="K85"),
            RAGSource(document="guide_methodo", page=30),
        ]
        principal = Diagnostic(
            texte="Pancréatite aiguë biliaire",
            cim10_suggestion="K85.1",
            cim10_confidence="high",
            justification="Confirmé par CIM-10 FR 2026",
            sources_rag=rag_sources,
        )
        dossier = DossierMedical(diagnostic_principal=principal)
        stored = dossier.diagnostic_principal
        assert stored.cim10_confidence == "high"
        assert len(stored.sources_rag) == 2
class TestExtractMedicalInfoRAGFlag:
    """Checks how extract_medical_info reacts to its ``use_rag`` flag."""

    # Sample input text shared by the tests that expect a principal diagnostic.
    _SAMPLE_TEXT = (
        "Pancréatite aiguë biliaire.\nTTT de sortie :\nParacétamol\n\nDevenir : retour."
    )

    @staticmethod
    def _parsed_document():
        """Return a fresh minimal parsed-document dict (new object per call)."""
        return {
            "type": "crh",
            "patient": {"sexe": "M"},
            "sejour": {},
            "diagnostics": [],
        }

    def test_use_rag_false_no_change(self):
        """use_rag=False leaves the existing behaviour untouched."""
        from src.medical.cim10_extractor import extract_medical_info

        dossier = extract_medical_info(
            self._parsed_document(), self._SAMPLE_TEXT, use_rag=False
        )
        principal = dossier.diagnostic_principal
        assert principal is not None
        assert dossier.diagnostic_principal.cim10_suggestion == "K85.1"
        # No RAG enrichment: no sources and no justification.
        assert dossier.diagnostic_principal.sources_rag == []
        assert dossier.diagnostic_principal.justification is None

    def test_use_rag_true_calls_enrich(self):
        """use_rag=True calls _enrich_with_rag (mocked) with the dossier."""
        from src.medical.cim10_extractor import extract_medical_info

        parsed_input = self._parsed_document()
        with patch("src.medical.cim10_extractor._enrich_with_rag") as enrich_mock:
            dossier = extract_medical_info(
                parsed_input, self._SAMPLE_TEXT, use_rag=True
            )
        enrich_mock.assert_called_once_with(dossier)

    def test_use_rag_default_false(self):
        """When use_rag is omitted, _enrich_with_rag is not called."""
        from src.medical.cim10_extractor import extract_medical_info

        parsed_input = self._parsed_document()
        with patch("src.medical.cim10_extractor._enrich_with_rag") as enrich_mock:
            extract_medical_info(parsed_input, "Test simple.")
        enrich_mock.assert_not_called()
@pytest.mark.skipif(
    not CIM10_PDF.exists(),
    reason=f"PDF CIM-10 non trouvé : {CIM10_PDF}",
)
class TestChunkingCIM10:
    """Chunking checks on the CIM-10 PDF.

    The whole class is skipped when the PDF file is absent. The skip mark is
    applied once at class level instead of being repeated on every test.
    """

    @pytest.fixture(scope="class")
    def cim10_chunks(self):
        """Chunk the CIM-10 PDF once and share the result across the class.

        The import stays local so that collecting this module does not
        require ``src.medical.rag_index`` to be importable.
        """
        from src.medical.rag_index import _chunk_cim10

        return _chunk_cim10(CIM10_PDF)

    def test_chunks_contain_known_codes(self, cim10_chunks):
        """The chunk list is large enough and includes K85, K80 and E66."""
        chunks = cim10_chunks
        assert len(chunks) > 100, f"Trop peu de chunks : {len(chunks)}"
        codes = {c.code for c in chunks if c.code}
        assert "K85" in codes, "K85 (pancréatite) non trouvé"
        assert "K80" in codes, "K80 (lithiase biliaire) non trouvé"
        assert "E66" in codes, "E66 (obésité) non trouvé"

    def test_chunk_content(self, cim10_chunks):
        """The first K85 chunk mentions pancreatitis, accented or not."""
        k85_chunks = [c for c in cim10_chunks if c.code == "K85"]
        assert len(k85_chunks) >= 1
        # Lower-case once; accept both the accented and unaccented spelling.
        lowered = k85_chunks[0].text.lower()
        assert "pancréatite" in lowered or "pancreatite" in lowered
class TestChunkingGuideMethodo:
    """Chunking check on the methodological guide PDF."""

    @pytest.mark.skipif(
        not GUIDE_METHODO_PDF.exists(),
        reason=f"PDF Guide Métho non trouvé : {GUIDE_METHODO_PDF}",
    )
    def test_chunks_extracted(self):
        """At least 10 chunks come back, all tagged ``guide_methodo``."""
        from src.medical.rag_index import _chunk_guide_methodo

        chunks = _chunk_guide_methodo(GUIDE_METHODO_PDF)
        chunk_count = len(chunks)
        assert chunk_count >= 10, f"Trop peu de chunks : {len(chunks)}"
        every_chunk_tagged = all(
            chunk.document == "guide_methodo" for chunk in chunks
        )
        assert every_chunk_tagged
class TestChunkingCCAM:
    """Chunking check on the CCAM PDF."""

    @pytest.mark.skipif(
        not CCAM_PDF.exists(),
        reason=f"PDF CCAM non trouvé : {CCAM_PDF}",
    )
    def test_chunks_extracted(self):
        """At least one chunk comes back, and every chunk is tagged ``ccam``."""
        from src.medical.rag_index import _chunk_ccam

        chunks = _chunk_ccam(CCAM_PDF)
        # Plain string: the message has no placeholders, so no f-prefix.
        assert len(chunks) >= 1, "Aucun chunk CCAM extrait"
        assert all(c.document == "ccam" for c in chunks)
class TestRAGSearchMocked:
    """Tests of rag_search with the index lookup and the LLM call patched."""

    @staticmethod
    def _k85_hit(score):
        """Build a single search-hit dict for code K85 with the given score."""
        return {
            "document": "cim10",
            "page": 496,
            "code": "K85",
            "extrait": "K85 Pancréatite aiguë...",
            "score": score,
        }

    def test_search_similar_no_index(self):
        """search_similar returns an empty list when get_index yields None."""
        from src.medical.rag_search import search_similar

        with patch("src.medical.rag_index.get_index", return_value=None):
            hits = search_similar("pancréatite aiguë")
        assert hits == []

    def test_enrich_diagnostic_no_sources(self):
        """enrich_diagnostic does not crash when the search finds nothing."""
        from src.medical.rag_search import enrich_diagnostic

        diag = Diagnostic(texte="test quelconque", cim10_suggestion="Z99.9")
        patient = {"sexe": "M", "age": 50}
        with patch("src.medical.rag_search.search_similar", return_value=[]):
            enrich_diagnostic(diag, patient)
        assert diag.sources_rag == []
        assert diag.justification is None

    def test_enrich_diagnostic_with_sources_no_ollama(self):
        """Search returns one hit, but the patched _call_ollama returns None."""
        from src.medical.rag_search import enrich_diagnostic

        diag = Diagnostic(texte="Pancréatite aiguë", cim10_suggestion="K85.9")
        hits = [self._k85_hit(0.92)]
        with patch("src.medical.rag_search.search_similar", return_value=hits):
            with patch("src.medical.rag_search._call_ollama", return_value=None):
                enrich_diagnostic(diag, {"sexe": "M", "age": 50})
        assert len(diag.sources_rag) == 1
        first_source = diag.sources_rag[0]
        assert first_source.document == "cim10"
        assert first_source.code == "K85"
        # No justification, since the LLM call returned nothing.
        assert diag.justification is None

    def test_enrich_diagnostic_with_ollama(self):
        """Full enrichment: one search hit plus a patched LLM answer."""
        from src.medical.rag_search import enrich_diagnostic

        diag = Diagnostic(texte="Pancréatite aiguë biliaire")
        hits = [self._k85_hit(0.95)]
        llm_answer = {
            "code": "K85.1",
            "confidence": "high",
            "justification": "Pancréatite aiguë d'origine biliaire = K85.1",
        }
        with patch("src.medical.rag_search.search_similar", return_value=hits):
            with patch(
                "src.medical.rag_search._call_ollama", return_value=llm_answer
            ):
                enrich_diagnostic(diag, {"sexe": "F", "age": 43})
        assert diag.cim10_suggestion == "K85.1"
        assert diag.cim10_confidence == "high"
        assert diag.justification == "Pancréatite aiguë d'origine biliaire = K85.1"
        assert len(diag.sources_rag) == 1