Initial commit
This commit is contained in:
527
tests/test_document_processor.py
Normal file
527
tests/test_document_processor.py
Normal file
@@ -0,0 +1,527 @@
|
||||
"""
|
||||
Tests pour le Document Processor.
|
||||
|
||||
Ce module teste la segmentation de documents cliniques et la gestion multi-documents.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
|
||||
from src.pipeline_mco_pmsi.processors.document_processor import (
|
||||
DocumentProcessor,
|
||||
DOCUMENT_TYPE_PRIORITIES,
|
||||
)
|
||||
from src.pipeline_mco_pmsi.models.clinical import (
|
||||
ClinicalDocument,
|
||||
Section,
|
||||
StructuredStay,
|
||||
)
|
||||
from src.pipeline_mco_pmsi.models.metadata import StayMetadata
|
||||
|
||||
|
||||
@pytest.fixture
def document_processor():
    """Provide a freshly constructed DocumentProcessor to each test."""
    processor = DocumentProcessor()
    return processor
|
||||
|
||||
|
||||
@pytest.fixture
def stay_metadata():
    """Provide the StayMetadata of a sample stay identified as "stay_001"."""
    admitted_at = datetime(2024, 1, 1, 10, 0)
    discharged_at = datetime(2024, 1, 5, 14, 0)
    metadata = StayMetadata(
        stay_id="stay_001",
        admission_date=admitted_at,
        discharge_date=discharged_at,
        specialty="Chirurgie",
        unit="Bloc A",
        age=45,
        sex="M",
    )
    return metadata
|
||||
|
||||
|
||||
@pytest.fixture
def simple_document():
    """Provide a one-sentence medical report without any section header."""
    text = "Patient admis pour douleurs abdominales."
    written_at = datetime(2024, 1, 1, 12, 0)
    document = ClinicalDocument(
        document_id="doc_001",
        document_type="cr_medical",
        content=text,
        creation_date=written_at,
        author="Dr. Martin",
        priority=2,
    )
    return document
|
||||
|
||||
|
||||
@pytest.fixture
def structured_document():
    """Provide an operative report made of four titled sections.

    The text contains the headers "Anamnèse", "Examen clinique",
    "Diagnostic" and "Traitement", each followed by one line of body text
    and separated from the next header by a blank line.
    """
    # Spelled out line by line so the exact newlines are visible.
    text = (
        "Anamnèse\n"
        "Patient de 45 ans, admis pour douleurs abdominales aiguës.\n"
        "\n"
        "Examen clinique\n"
        "Abdomen tendu, défense généralisée.\n"
        "\n"
        "Diagnostic\n"
        "Appendicite aiguë.\n"
        "\n"
        "Traitement\n"
        "Appendicectomie sous cœlioscopie."
    )
    written_at = datetime(2024, 1, 2, 9, 0)
    document = ClinicalDocument(
        document_id="doc_002",
        document_type="cr_operatoire",
        content=text,
        creation_date=written_at,
        author="Dr. Dupont",
        priority=1,
    )
    return document
|
||||
|
||||
|
||||
class TestDocumentProcessor:
    """Unit tests for the DocumentProcessor class."""

    def test_init(self, document_processor):
        """A new processor exposes a non-empty `_compiled_patterns`."""
        assert document_processor is not None
        assert hasattr(document_processor, "_compiled_patterns")
        assert len(document_processor._compiled_patterns) > 0

    def test_process_documents_empty_list(
        self, document_processor, stay_metadata
    ):
        """process_documents raises ValueError for an empty document list."""
        with pytest.raises(ValueError, match="ne peut pas être vide"):
            document_processor.process_documents([], stay_metadata)

    def test_process_documents_single_document(
        self, document_processor, simple_document, stay_metadata
    ):
        """Processing one document yields a StructuredStay wrapping it."""
        result = document_processor.process_documents(
            [simple_document], stay_metadata
        )

        assert isinstance(result, StructuredStay)
        assert result.stay_id == stay_metadata.stay_id
        assert len(result.documents) == 1
        assert result.documents[0].document_id == simple_document.document_id
        assert len(result.sections) > 0

    def test_process_documents_assigns_priorities(
        self, document_processor, stay_metadata
    ):
        """Priorities are assigned from the document type and used to sort."""
        # Every document starts at priority 5; the processor is expected to
        # reassign it according to the document type.
        specs = [
            ("doc_courrier", "courrier", "Courrier de sortie.", 1, "Dr. A"),
            ("doc_cro", "cr_operatoire", "Compte rendu opératoire.", 2, "Dr. B"),
            ("doc_crm", "cr_medical", "Compte rendu médical.", 3, "Dr. C"),
        ]
        docs = [
            ClinicalDocument(
                document_id=doc_id,
                document_type=doc_type,
                content=text,
                creation_date=datetime(2024, 1, day, 10, 0),
                author=author,
                priority=5,
            )
            for doc_id, doc_type, text, day, author in specs
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Documents must come back sorted by ascending priority value.
        observed = [(d.document_type, d.priority) for d in result.documents]
        assert observed == [
            ("cr_operatoire", 1),
            ("cr_medical", 2),
            ("courrier", 5),
        ]

    def test_process_documents_multi_documents(
        self, document_processor, stay_metadata
    ):
        """Several documents are ordered by priority and all segmented."""
        docs = [
            ClinicalDocument(
                document_id="doc_1",
                document_type="biologie",
                content="Résultats de biologie.",
                creation_date=datetime(2024, 1, 1, 10, 0),
                author="Lab",
                priority=4,
            ),
            ClinicalDocument(
                document_id="doc_2",
                document_type="cr_operatoire",
                content="Intervention chirurgicale.",
                creation_date=datetime(2024, 1, 2, 10, 0),
                author="Dr. Chirurgien",
                priority=1,
            ),
            ClinicalDocument(
                document_id="doc_3",
                document_type="imagerie",
                content="Scanner abdominal.",
                creation_date=datetime(2024, 1, 1, 15, 0),
                author="Dr. Radiologue",
                priority=3,
            ),
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Expected order: operative report > imaging > biology.
        observed_types = [d.document_type for d in result.documents]
        assert observed_types == ["cr_operatoire", "imagerie", "biologie"]

        # At least one section per document.
        assert len(result.sections) >= 3

    def test_segment_document_no_sections(
        self, document_processor, simple_document
    ):
        """A document without headers becomes a single "autre" section."""
        sections = document_processor.segment_document(simple_document)

        # The whole content must end up in one catch-all section.
        assert len(sections) == 1
        only_section = sections[0]
        assert only_section.section_type == "autre"
        assert only_section.content == simple_document.content
        assert only_section.span.start == 0
        assert only_section.span.end == len(simple_document.content)

    def test_segment_document_with_sections(
        self, document_processor, structured_document
    ):
        """A document with headers is split into typed, non-empty sections."""
        sections = document_processor.segment_document(structured_document)

        assert len(sections) >= 4

        section_types = [s.section_type for s in sections]
        assert "anamnese" in section_types
        assert "examen" in section_types
        assert "diagnostic" in section_types
        assert "traitement" in section_types

        # Every section must carry text and a non-degenerate span.
        for section in sections:
            assert len(section.content) > 0
            assert section.span.end > section.span.start

    def test_segment_document_section_order(
        self, document_processor, structured_document
    ):
        """Sections are returned in document order (non-decreasing starts)."""
        sections = document_processor.segment_document(structured_document)

        # Guard against a vacuous pass: with no sections the pairwise loop
        # below would not execute a single assertion.
        assert sections, "segment_document returned no sections"

        for earlier, later in zip(sections, sections[1:]):
            assert earlier.span.start <= later.span.start

    def test_detect_section_type_anamnese(self, document_processor):
        """History-style headers are detected as "anamnese"."""
        detect = document_processor._detect_section_type
        assert detect("Anamnèse") == "anamnese"
        assert detect("Histoire de la maladie") == "anamnese"
        assert detect("ANTÉCÉDENTS") == "anamnese"

    def test_detect_section_type_examen(self, document_processor):
        """Examination headers are detected as "examen"."""
        detect = document_processor._detect_section_type
        assert detect("Examen clinique") == "examen"
        assert detect("Examen physique") == "examen"
        assert detect("Inspection") == "examen"

    def test_detect_section_type_diagnostic(self, document_processor):
        """Diagnosis headers are detected as "diagnostic"."""
        detect = document_processor._detect_section_type
        assert detect("Diagnostic") == "diagnostic"
        assert detect("Conclusion diagnostique") == "diagnostic"
        assert detect("Diagnostic retenu") == "diagnostic"

    def test_detect_section_type_traitement(self, document_processor):
        """Treatment headers are detected as "traitement"."""
        detect = document_processor._detect_section_type
        assert detect("Traitement") == "traitement"
        assert detect("Traitement prescrit") == "traitement"
        assert detect("Prescription") == "traitement"

    def test_detect_section_type_evolution(self, document_processor):
        """Follow-up headers are detected as "evolution"."""
        detect = document_processor._detect_section_type
        assert detect("Évolution") == "evolution"
        assert detect("Suites") == "evolution"
        assert detect("Évolution post-opératoire") == "evolution"

    def test_detect_section_type_conclusion(self, document_processor):
        """Summary headers are detected as "conclusion"."""
        detect = document_processor._detect_section_type
        assert detect("Conclusion") == "conclusion"
        assert detect("Synthèse") == "conclusion"

    def test_detect_section_type_no_match(self, document_processor):
        """Ordinary text, empty and blank strings yield no section type."""
        detect = document_processor._detect_section_type
        assert detect("Patient admis") is None
        assert detect("") is None
        assert detect(" ") is None

    def test_detect_section_type_case_insensitive(self, document_processor):
        """Header detection does not depend on letter case."""
        detect = document_processor._detect_section_type
        assert detect("ANAMNÈSE") == "anamnese"
        assert detect("anamnèse") == "anamnese"
        assert detect("Anamnèse") == "anamnese"

    def test_create_section(self, document_processor):
        """_create_section builds the id, type, content and span."""
        lines = ["Anamnèse", "Patient de 45 ans", "Douleurs abdominales"]
        section = document_processor._create_section(
            document_id="doc_001",
            section_type="anamnese",
            lines=lines,
            start_pos=0,
            section_idx=0,
        )

        assert section.section_id == "doc_001_section_0"
        assert section.section_type == "anamnese"
        assert "Anamnèse" in section.content
        assert "Patient de 45 ans" in section.content
        assert section.span.start == 0
        assert section.span.end > 0

    def test_document_type_priorities_complete(self):
        """Every expected document type has a priority between 1 and 5."""
        expected_types = [
            "cr_operatoire",
            "cr_medical",
            "imagerie",
            "biologie",
            "courrier",
            "autre",
        ]

        for doc_type in expected_types:
            assert doc_type in DOCUMENT_TYPE_PRIORITIES
            assert 1 <= DOCUMENT_TYPE_PRIORITIES[doc_type] <= 5

    def test_document_type_priorities_order(self):
        """Document types carry the expected priority values."""
        # Operative reports have the highest priority (1).
        assert DOCUMENT_TYPE_PRIORITIES["cr_operatoire"] == 1

        # Medical reports come second.
        assert DOCUMENT_TYPE_PRIORITIES["cr_medical"] == 2

        # Imaging comes third.
        assert DOCUMENT_TYPE_PRIORITIES["imagerie"] == 3

        # Biology comes fourth.
        assert DOCUMENT_TYPE_PRIORITIES["biologie"] == 4

        # Letters and other documents share the lowest priority.
        assert DOCUMENT_TYPE_PRIORITIES["courrier"] == 5
        assert DOCUMENT_TYPE_PRIORITIES["autre"] == 5

    def test_segment_document_complex_structure(self, document_processor):
        """A six-section report with a title line is fully segmented."""
        content = (
            "COMPTE RENDU OPÉRATOIRE\n"
            "\n"
            "Anamnèse\n"
            "Patient de 65 ans, diabétique, hypertendu.\n"
            "Admis pour douleurs abdominales aiguës depuis 24h.\n"
            "\n"
            "Examen clinique\n"
            "Abdomen tendu, défense généralisée.\n"
            "Température 38.5°C.\n"
            "\n"
            "Diagnostic\n"
            "Appendicite aiguë compliquée.\n"
            "Suspicion de péritonite.\n"
            "\n"
            "Traitement\n"
            "Appendicectomie en urgence sous cœlioscopie.\n"
            "Antibiothérapie large spectre.\n"
            "\n"
            "Évolution\n"
            "Suites opératoires simples.\n"
            "Sortie à J+3.\n"
            "\n"
            "Conclusion\n"
            "Intervention réussie sans complication."
        )

        doc = ClinicalDocument(
            document_id="doc_complex",
            document_type="cr_operatoire",
            content=content,
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Chirurgien",
            priority=1,
        )

        sections = document_processor.segment_document(doc)

        # All six titled sections must be detected.
        assert len(sections) >= 6

        section_types = [s.section_type for s in sections]
        assert "anamnese" in section_types
        assert "examen" in section_types
        assert "diagnostic" in section_types
        assert "traitement" in section_types
        assert "evolution" in section_types
        assert "conclusion" in section_types

    def test_segment_document_preserves_content(self, document_processor):
        """The body text of each section survives segmentation."""
        content = (
            "Anamnèse\n"
            "Patient admis.\n"
            "\n"
            "Diagnostic\n"
            "Appendicite."
        )

        doc = ClinicalDocument(
            document_id="doc_test",
            document_type="cr_medical",
            content=content,
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Test",
            priority=2,
        )

        sections = document_processor.segment_document(doc)

        # Rebuild the text from the sections; only the body sentences are
        # checked because line breaks may differ from the input.
        reconstructed = "\n".join(s.content for s in sections)

        assert "Patient admis" in reconstructed
        assert "Appendicite" in reconstructed

    def test_process_documents_validates_stay_id(
        self, document_processor, simple_document, stay_metadata
    ):
        """The stay_id of the metadata is propagated to the result."""
        result = document_processor.process_documents(
            [simple_document], stay_metadata
        )

        assert result.stay_id == stay_metadata.stay_id

    def test_segment_document_empty_content(self, document_processor):
        """A one-character document still produces a section holding it."""
        doc = ClinicalDocument(
            document_id="doc_empty",
            document_type="cr_medical",
            content="X",  # minimal content (min_length=1)
            creation_date=datetime(2024, 1, 1, 10, 0),
            author="Dr. Test",
            priority=2,
        )

        sections = document_processor.segment_document(doc)

        # At least one section must be created.
        assert len(sections) >= 1
        assert sections[0].content == "X"
|
||||
|
||||
|
||||
class TestDocumentProcessorIntegration:
    """Integration tests for the DocumentProcessor."""

    def test_full_workflow_single_document(
        self, document_processor, structured_document, stay_metadata
    ):
        """Run the full workflow on one structured document."""
        result = document_processor.process_documents(
            [structured_document], stay_metadata
        )

        # Check the overall structure of the result.
        assert isinstance(result, StructuredStay)
        assert result.stay_id == stay_metadata.stay_id
        assert len(result.documents) == 1
        assert len(result.sections) >= 4
        assert result.facts == []  # no facts have been extracted yet

        # Check that every section is well formed.
        allowed_types = [
            "anamnese",
            "examen",
            "diagnostic",
            "traitement",
            "evolution",
            "conclusion",
            "autre",
        ]
        for section in result.sections:
            assert section.section_id.startswith(structured_document.document_id)
            assert section.section_type in allowed_types
            assert len(section.content) > 0

    def test_full_workflow_multi_documents(
        self, document_processor, stay_metadata
    ):
        """Run the full workflow on a stay made of three documents."""
        admission_text = (
            "Anamnèse\n"
            "Patient admis pour douleurs abdominales.\n"
            "\n"
            "Examen clinique\n"
            "Abdomen sensible."
        )
        operation_text = (
            "Diagnostic\n"
            "Appendicite aiguë.\n"
            "\n"
            "Traitement\n"
            "Appendicectomie sous cœlioscopie."
        )

        # Build a complete stay out of several documents.
        docs = [
            ClinicalDocument(
                document_id="doc_admission",
                document_type="cr_medical",
                content=admission_text,
                creation_date=datetime(2024, 1, 1, 10, 0),
                author="Dr. Urgentiste",
                priority=2,
            ),
            ClinicalDocument(
                document_id="doc_operation",
                document_type="cr_operatoire",
                content=operation_text,
                creation_date=datetime(2024, 1, 2, 9, 0),
                author="Dr. Chirurgien",
                priority=1,
            ),
            ClinicalDocument(
                document_id="doc_imagerie",
                document_type="imagerie",
                content="Scanner abdominal : appendice inflammatoire.",
                creation_date=datetime(2024, 1, 1, 15, 0),
                author="Dr. Radiologue",
                priority=3,
            ),
        ]

        result = document_processor.process_documents(docs, stay_metadata)

        # Documents must be ordered by priority.
        observed_types = [d.document_type for d in result.documents]
        assert observed_types == ["cr_operatoire", "cr_medical", "imagerie"]

        # At least one section per document.
        assert len(result.sections) >= 3

        # Sections must originate from all three documents; the document id
        # is the part of the section id that precedes "_section_".
        doc_ids = {s.section_id.split("_section_")[0] for s in result.sections}
        assert len(doc_ids) == 3
|
||||
Reference in New Issue
Block a user