# File-viewer metadata captured with the paste (not part of the module):
# 528 lines, 19 KiB, Python.
"""
Tests for the Document Processor.

This module tests clinical document segmentation and multi-document handling.
"""

from datetime import datetime

import pytest

from src.pipeline_mco_pmsi.models.clinical import (
    ClinicalDocument,
    Section,
    StructuredStay,
)
from src.pipeline_mco_pmsi.models.metadata import StayMetadata
from src.pipeline_mco_pmsi.processors.document_processor import (
    DOCUMENT_TYPE_PRIORITIES,
    DocumentProcessor,
)


@pytest.fixture
def document_processor() -> DocumentProcessor:
    """Return a ``DocumentProcessor`` built with its default arguments."""
    return DocumentProcessor()
@pytest.fixture
def stay_metadata() -> StayMetadata:
    """Return the stay metadata used by the ``process_documents`` tests.

    The stay is ``"stay_001"``: admitted on 2024-01-01 at 10:00 and
    discharged on 2024-01-05 at 14:00, specialty ``"Chirurgie"``, unit
    ``"Bloc A"``, for a 45-year-old patient of sex ``"M"``.
    """
    return StayMetadata(
        stay_id="stay_001",
        admission_date=datetime(2024, 1, 1, 10, 0),
        discharge_date=datetime(2024, 1, 5, 14, 0),
        specialty="Chirurgie",
        unit="Bloc A",
        age=45,
        sex="M",
    )
@pytest.fixture
def simple_document() -> ClinicalDocument:
    """Return a one-sentence ``cr_medical`` document.

    The content is a single sentence with no section heading; the tests use
    it as the "no detectable section" input.
    """
    return ClinicalDocument(
        document_id="doc_001",
        document_type="cr_medical",
        content="Patient admis pour douleurs abdominales.",
        creation_date=datetime(2024, 1, 1, 12, 0),
        author="Dr. Martin",
        priority=2,
    )
@pytest.fixture
def structured_document() -> ClinicalDocument:
    """Return a ``cr_operatoire`` document made of four headed sections.

    The content holds the headings "Anamnèse", "Examen clinique",
    "Diagnostic" and "Traitement", each followed by one line of text and
    separated from the next heading by a blank line.
    """
    # NOTE(review): continuation lines are kept flush-left so every heading
    # starts at column 0; the original indentation was lost -- confirm.
    content = """Anamnèse
Patient de 45 ans, admis pour douleurs abdominales aiguës.

Examen clinique
Abdomen tendu, défense généralisée.

Diagnostic
Appendicite aiguë.

Traitement
Appendicectomie sous cœlioscopie."""

    return ClinicalDocument(
        document_id="doc_002",
        document_type="cr_operatoire",
        content=content,
        creation_date=datetime(2024, 1, 2, 9, 0),
        author="Dr. Dupont",
        priority=1,
    )
class TestDocumentProcessor:
    """Unit tests for the ``DocumentProcessor`` class."""

    def test_init(self, document_processor):
        """A new processor exposes a non-empty ``_compiled_patterns``."""
        assert document_processor is not None
        assert hasattr(document_processor, "_compiled_patterns")
        assert len(document_processor._compiled_patterns) > 0

    def test_process_documents_empty_list(
        self, document_processor, stay_metadata
    ):
        """``process_documents`` raises ``ValueError`` for an empty list."""
        with pytest.raises(ValueError, match="ne peut pas être vide"):
            document_processor.process_documents([], stay_metadata)

    def test_process_documents_single_document(
        self, document_processor, simple_document, stay_metadata
    ):
        """Processing one document yields a ``StructuredStay`` wrapping it."""
        result = document_processor.process_documents(
            [simple_document], stay_metadata
        )

        assert isinstance(result, StructuredStay)
        assert result.stay_id == stay_metadata.stay_id
        assert len(result.documents) == 1
        assert result.documents[0].document_id == simple_document.document_id
        assert len(result.sections) > 0

    def test_process_documents_assigns_priorities(
        self, document_processor, stay_metadata
    ):
        """Priorities are assigned from the document type, then sorted on."""
        # Every document starts with priority 5 and documents of different
        # types are given in non-priority order.
        # NOTE(review): the "courrier" document is also expected to end at 5,
        # so only the two other documents demonstrate the reassignment.
        docs = [
            ClinicalDocument(
                document_id="doc_courrier",
                document_type="courrier",
                content="Courrier de sortie.",
                creation_date=datetime(2024, 1, 1, 10, 0),
                author="Dr. A",
                priority=5,  # expected to be reassigned
            ),
            ClinicalDocument(
                document_id="doc_cro",
                document_type="cr_operatoire",
                content="Compte rendu opératoire.",
                creation_date=datetime(2024, 1, 2, 10, 0),
                author="Dr. B",
                priority=5,  # expected to be reassigned
            ),
            ClinicalDocument(
                document_id="doc_crm",
                document_type="cr_medical",
                content="Compte rendu médical.",
                creation_date=datetime(2024, 1, 3, 10, 0),
                author="Dr. C",
                priority=5,  # expected to be reassigned
            ),
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Documents must come back sorted by priority.
        assert len(result.documents) == 3
        assert result.documents[0].document_type == "cr_operatoire"  # priority 1
        assert result.documents[0].priority == 1
        assert result.documents[1].document_type == "cr_medical"  # priority 2
        assert result.documents[1].priority == 2
        assert result.documents[2].document_type == "courrier"  # priority 5
        assert result.documents[2].priority == 5

    def test_process_documents_multi_documents(
        self, document_processor, stay_metadata
    ):
        """Several documents are returned in priority order with sections."""
        docs = [
            ClinicalDocument(
                document_id="doc_1",
                document_type="biologie",
                content="Résultats de biologie.",
                creation_date=datetime(2024, 1, 1, 10, 0),
                author="Lab",
                priority=4,
            ),
            ClinicalDocument(
                document_id="doc_2",
                document_type="cr_operatoire",
                content="Intervention chirurgicale.",
                creation_date=datetime(2024, 1, 2, 10, 0),
                author="Dr. Chirurgien",
                priority=1,
            ),
            ClinicalDocument(
                document_id="doc_3",
                document_type="imagerie",
                content="Scanner abdominal.",
                creation_date=datetime(2024, 1, 1, 15, 0),
                author="Dr. Radiologue",
                priority=3,
            ),
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Priority order: operative report > imaging > biology.
        assert len(result.documents) == 3
        assert result.documents[0].document_type == "cr_operatoire"
        assert result.documents[1].document_type == "imagerie"
        assert result.documents[2].document_type == "biologie"

        # At least one section per input document.
        assert len(result.sections) >= 3

    def test_segment_document_no_sections(
        self, document_processor, simple_document
    ):
        """A document without headings becomes one ``"autre"`` section."""
        sections = document_processor.segment_document(simple_document)

        # The single section must carry the whole content and span all of it.
        assert len(sections) == 1
        assert sections[0].section_type == "autre"
        assert sections[0].content == simple_document.content
        assert sections[0].span.start == 0
        assert sections[0].span.end == len(simple_document.content)

    def test_segment_document_with_sections(
        self, document_processor, structured_document
    ):
        """A document with four headings yields the four section types."""
        sections = document_processor.segment_document(structured_document)

        # Several sections must be detected.
        assert len(sections) >= 4

        # Check which section types were detected.
        section_types = [s.section_type for s in sections]
        assert "anamnese" in section_types
        assert "examen" in section_types
        assert "diagnostic" in section_types
        assert "traitement" in section_types

        # Every section has non-empty content and a non-empty span.
        for section in sections:
            assert len(section.content) > 0
            assert section.span.end > section.span.start

    def test_segment_document_section_order(
        self, document_processor, structured_document
    ):
        """Sections are returned in document order."""
        sections = document_processor.segment_document(structured_document)

        # Start offsets must be non-decreasing from one section to the next.
        for current, following in zip(sections, sections[1:]):
            assert current.span.start <= following.span.start

    def test_detect_section_type_anamnese(self, document_processor):
        """History-type headings are detected as ``"anamnese"``."""
        for heading in ("Anamnèse", "Histoire de la maladie", "ANTÉCÉDENTS"):
            assert document_processor._detect_section_type(heading) == "anamnese"

    def test_detect_section_type_examen(self, document_processor):
        """Examination headings are detected as ``"examen"``."""
        for heading in ("Examen clinique", "Examen physique", "Inspection"):
            assert document_processor._detect_section_type(heading) == "examen"

    def test_detect_section_type_diagnostic(self, document_processor):
        """Diagnosis headings are detected as ``"diagnostic"``."""
        for heading in (
            "Diagnostic",
            "Conclusion diagnostique",
            "Diagnostic retenu",
        ):
            assert (
                document_processor._detect_section_type(heading) == "diagnostic"
            )

    def test_detect_section_type_traitement(self, document_processor):
        """Treatment headings are detected as ``"traitement"``."""
        for heading in ("Traitement", "Traitement prescrit", "Prescription"):
            assert (
                document_processor._detect_section_type(heading) == "traitement"
            )

    def test_detect_section_type_evolution(self, document_processor):
        """Follow-up headings are detected as ``"evolution"``."""
        for heading in ("Évolution", "Suites", "Évolution post-opératoire"):
            assert document_processor._detect_section_type(heading) == "evolution"

    def test_detect_section_type_conclusion(self, document_processor):
        """Summary headings are detected as ``"conclusion"``."""
        for heading in ("Conclusion", "Synthèse"):
            assert (
                document_processor._detect_section_type(heading) == "conclusion"
            )

    def test_detect_section_type_no_match(self, document_processor):
        """Ordinary text, an empty string and whitespace give ``None``."""
        for text in ("Patient admis", "", " "):
            assert document_processor._detect_section_type(text) is None

    def test_detect_section_type_case_insensitive(self, document_processor):
        """Detection does not depend on the case of the heading."""
        for heading in ("ANAMNÈSE", "anamnèse", "Anamnèse"):
            assert document_processor._detect_section_type(heading) == "anamnese"

    def test_create_section(self, document_processor):
        """``_create_section`` builds the id, type, content and span."""
        lines = ["Anamnèse", "Patient de 45 ans", "Douleurs abdominales"]
        section = document_processor._create_section(
            document_id="doc_001",
            section_type="anamnese",
            lines=lines,
            start_pos=0,
            section_idx=0,
        )

        assert section.section_id == "doc_001_section_0"
        assert section.section_type == "anamnese"
        assert "Anamnèse" in section.content
        assert "Patient de 45 ans" in section.content
        assert section.span.start == 0
        assert section.span.end > 0

    def test_document_type_priorities_complete(self):
        """Every expected document type has a priority between 1 and 5."""
        expected_types = [
            "cr_operatoire",
            "cr_medical",
            "imagerie",
            "biologie",
            "courrier",
            "autre",
        ]

        for doc_type in expected_types:
            assert doc_type in DOCUMENT_TYPE_PRIORITIES
            assert 1 <= DOCUMENT_TYPE_PRIORITIES[doc_type] <= 5

    def test_document_type_priorities_order(self):
        """Each document type maps to its expected priority value."""
        # The operative report has the highest priority (1).
        assert DOCUMENT_TYPE_PRIORITIES["cr_operatoire"] == 1

        # The medical report comes second.
        assert DOCUMENT_TYPE_PRIORITIES["cr_medical"] == 2

        # Imaging comes third.
        assert DOCUMENT_TYPE_PRIORITIES["imagerie"] == 3

        # Biology comes fourth.
        assert DOCUMENT_TYPE_PRIORITIES["biologie"] == 4

        # Letters and other documents share the lowest priority.
        assert DOCUMENT_TYPE_PRIORITIES["courrier"] == 5
        assert DOCUMENT_TYPE_PRIORITIES["autre"] == 5

    def test_segment_document_complex_structure(self, document_processor):
        """A titled document with six headed sections is fully segmented."""
        # NOTE(review): continuation lines are kept flush-left so every
        # heading starts at column 0; the original indentation was lost --
        # confirm.
        content = """COMPTE RENDU OPÉRATOIRE

Anamnèse
Patient de 65 ans, diabétique, hypertendu.
Admis pour douleurs abdominales aiguës depuis 24h.

Examen clinique
Abdomen tendu, défense généralisée.
Température 38.5°C.

Diagnostic
Appendicite aiguë compliquée.
Suspicion de péritonite.

Traitement
Appendicectomie en urgence sous cœlioscopie.
Antibiothérapie large spectre.

Évolution
Suites opératoires simples.
Sortie à J+3.

Conclusion
Intervention réussie sans complication."""

        doc = ClinicalDocument(
            document_id="doc_complex",
            document_type="cr_operatoire",
            content=content,
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Chirurgien",
            priority=1,
        )

        sections = document_processor.segment_document(doc)

        # All six headed sections must be detected.
        assert len(sections) >= 6

        section_types = [s.section_type for s in sections]
        assert "anamnese" in section_types
        assert "examen" in section_types
        assert "diagnostic" in section_types
        assert "traitement" in section_types
        assert "evolution" in section_types
        assert "conclusion" in section_types

    def test_segment_document_preserves_content(self, document_processor):
        """Body text of each section is still present after segmentation.

        NOTE(review): only two fragments are checked, not the whole content.
        """
        content = """Anamnèse
Patient admis.

Diagnostic
Appendicite."""

        doc = ClinicalDocument(
            document_id="doc_test",
            document_type="cr_medical",
            content=content,
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Test",
            priority=2,
        )

        sections = document_processor.segment_document(doc)

        # Rebuild the text from the sections.
        reconstructed = "\n".join(s.content for s in sections)

        # The body text must survive (line breaks aside).
        assert "Patient admis" in reconstructed
        assert "Appendicite" in reconstructed

    def test_process_documents_validates_stay_id(
        self, document_processor, simple_document, stay_metadata
    ):
        """The ``stay_id`` from the metadata is propagated to the result."""
        result = document_processor.process_documents(
            [simple_document], stay_metadata
        )

        assert result.stay_id == stay_metadata.stay_id

    def test_segment_document_empty_content(self, document_processor):
        """A one-character document still yields a section with that text."""
        doc = ClinicalDocument(
            document_id="doc_empty",
            document_type="cr_medical",
            content="X",  # minimal content (min_length=1)
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Test",
            priority=2,
        )

        sections = document_processor.segment_document(doc)

        # At least one section must be created.
        assert len(sections) >= 1
        assert sections[0].content == "X"
class TestDocumentProcessorIntegration:
    """Integration tests for the ``DocumentProcessor``."""

    def test_full_workflow_single_document(
        self, document_processor, structured_document, stay_metadata
    ):
        """Full workflow on one structured document."""
        result = document_processor.process_documents(
            [structured_document], stay_metadata
        )

        # Check the overall structure.
        assert isinstance(result, StructuredStay)
        assert result.stay_id == stay_metadata.stay_id
        assert len(result.documents) == 1
        assert len(result.sections) >= 4
        assert result.facts == []  # no facts extracted yet

        # Section types accepted by this test; built once, outside the loop.
        allowed_types = (
            "anamnese",
            "examen",
            "diagnostic",
            "traitement",
            "evolution",
            "conclusion",
            "autre",
        )

        # Every section must be valid and belong to the input document.
        for section in result.sections:
            assert section.section_id.startswith(structured_document.document_id)
            assert section.section_type in allowed_types
            assert len(section.content) > 0

    def test_full_workflow_multi_documents(
        self, document_processor, stay_metadata
    ):
        """Full workflow on a stay made of three documents."""
        # NOTE(review): continuation lines of the multi-line contents are
        # kept flush-left so every heading starts at column 0; the original
        # indentation was lost -- confirm.
        docs = [
            ClinicalDocument(
                document_id="doc_admission",
                document_type="cr_medical",
                content="""Anamnèse
Patient admis pour douleurs abdominales.

Examen clinique
Abdomen sensible.""",
                creation_date=datetime(2024, 1, 1, 10, 0),
                author="Dr. Urgentiste",
                priority=2,
            ),
            ClinicalDocument(
                document_id="doc_operation",
                document_type="cr_operatoire",
                content="""Diagnostic
Appendicite aiguë.

Traitement
Appendicectomie sous cœlioscopie.""",
                creation_date=datetime(2024, 1, 2, 9, 0),
                author="Dr. Chirurgien",
                priority=1,
            ),
            ClinicalDocument(
                document_id="doc_imagerie",
                document_type="imagerie",
                content="Scanner abdominal : appendice inflammatoire.",
                creation_date=datetime(2024, 1, 1, 15, 0),
                author="Dr. Radiologue",
                priority=3,
            ),
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Check the structure.
        assert len(result.documents) == 3

        # Check the priority order.
        assert result.documents[0].document_type == "cr_operatoire"
        assert result.documents[1].document_type == "cr_medical"
        assert result.documents[2].document_type == "imagerie"

        # At least one section per input document.
        assert len(result.sections) >= 3

        # Sections must come from three distinct documents; the document id
        # is the part of the section id before "_section_".
        doc_ids = {
            section.section_id.partition("_section_")[0]
            for section in result.sections
        }
        assert len(doc_ids) == 3