"""Tests pour l'extraction multi-format (PDF, images, DOCX).""" from __future__ import annotations from pathlib import Path from unittest.mock import MagicMock, patch import pytest from src.extraction.document_router import ( SUPPORTED_EXTENSIONS, extract_document_with_pages, ) from src.extraction.page_tracker import PageTracker from src.extraction.pdf_extractor import ( ExtractionMethod, ExtractionStats, _compute_extraction_stats, ) # --------------------------------------------------------------------------- # Tests du router — dispatch par extension # --------------------------------------------------------------------------- class TestDocumentRouter: """Tests unitaires du dispatch par extension.""" def test_router_pdf_dispatches_correctly(self, tmp_path): """Un fichier .pdf est dispatché vers extract_text_with_pages.""" pdf_file = tmp_path / "test.pdf" pdf_file.touch() mock_stats = ExtractionStats(total_pages=1, source_format="pdf") mock_tracker = PageTracker([(0, 10)]) mock_return = ("texte pdf", mock_tracker, mock_stats) with patch( "src.extraction.pdf_extractor.extract_text_with_pages", return_value=mock_return, ): result = extract_document_with_pages(pdf_file) assert len(result) == 3 assert result[0] == "texte pdf" assert result[2].source_format == "pdf" def test_router_image_dispatches_correctly(self, tmp_path): """Un fichier .png est dispatché vers extract_text_from_image.""" png_file = tmp_path / "test.png" png_file.touch() mock_stats = ExtractionStats( total_pages=1, source_format="image", methods=[ExtractionMethod.IMAGE], ) mock_tracker = PageTracker([(0, 10)]) mock_return = ("texte ocr", mock_tracker, mock_stats) with patch( "src.extraction.image_extractor.extract_text_from_image", return_value=mock_return, ): result = extract_document_with_pages(png_file) assert len(result) == 3 assert result[0] == "texte ocr" assert result[2].source_format == "image" def test_router_docx_dispatches_correctly(self, tmp_path): """Un fichier .docx est dispatché vers extract_text_from_docx.""" docx_file = tmp_path / "test.docx" docx_file.touch() mock_stats = ExtractionStats( total_pages=1, source_format="docx", methods=[ExtractionMethod.DOCX], ) mock_tracker = PageTracker([(0, 10)]) mock_return = ("texte docx", mock_tracker, mock_stats) with patch( "src.extraction.docx_extractor.extract_text_from_docx", return_value=mock_return, ): result = extract_document_with_pages(docx_file) assert len(result) == 3 assert result[0] == "texte docx" assert result[2].source_format == "docx" def test_router_unsupported_extension_raises(self, tmp_path): """Une extension non supportée lève ValueError.""" xyz_file = tmp_path / "test.xyz" xyz_file.touch() with pytest.raises(ValueError, match="Format non supporté"): extract_document_with_pages(xyz_file) def test_router_supported_extensions_complete(self): """Vérifie que SUPPORTED_EXTENSIONS contient tous les formats prévus.""" expected = {".pdf", ".jpg", ".jpeg", ".png", ".tiff", ".tif", ".docx"} assert SUPPORTED_EXTENSIONS == expected @pytest.mark.parametrize("ext", [".jpg", ".jpeg", ".tiff", ".tif"]) def test_router_all_image_extensions(self, tmp_path, ext): """Toutes les extensions image sont reconnues.""" img_file = tmp_path / f"test{ext}" img_file.touch() mock_stats = ExtractionStats(total_pages=1, source_format="image") mock_tracker = PageTracker([(0, 5)]) with patch( "src.extraction.image_extractor.extract_text_from_image", return_value=("texte", mock_tracker, mock_stats), ): result = extract_document_with_pages(img_file) assert result[2].source_format == "image" # --------------------------------------------------------------------------- # Tests ExtractionStats enrichi # --------------------------------------------------------------------------- class TestExtractionStats: """Tests des nouveaux champs d'ExtractionStats.""" def test_stats_tracks_method(self): """Vérifie que methods/backend/source_format sont renseignés.""" methods = [ ExtractionMethod.NATIVE_PDFPLUMBER, ExtractionMethod.NATIVE_PDFPLUMBER, ExtractionMethod.OCR_DOCTR, ] stats = _compute_extraction_stats( ["page 1 avec du texte", "page 2 avec du texte", "page 3 ocr"], methods=methods, backend="pdfplumber", ) assert stats.methods == methods assert stats.backend == "pdfplumber" assert stats.source_format == "pdf" assert stats.native_pages == 2 assert stats.ocr_pages == 1 def test_stats_default_values(self): """Les valeurs par défaut sont correctes.""" stats = ExtractionStats() assert stats.methods == [] assert stats.native_pages == 0 assert stats.ocr_pages == 0 assert stats.backend == "pdfplumber" assert stats.source_format == "pdf" def test_stats_image_format(self): """ExtractionStats pour une image.""" stats = ExtractionStats( total_pages=1, source_format="image", methods=[ExtractionMethod.IMAGE], backend="doctr", ) assert stats.source_format == "image" assert stats.methods[0] == ExtractionMethod.IMAGE def test_stats_docx_format(self): """ExtractionStats pour un DOCX.""" stats = ExtractionStats( total_pages=3, source_format="docx", methods=[ExtractionMethod.DOCX] * 3, backend="python-docx", native_pages=3, ) assert stats.source_format == "docx" assert len(stats.methods) == 3 assert all(m == ExtractionMethod.DOCX for m in stats.methods) def test_compute_stats_with_methods(self): """_compute_extraction_stats calcule correctement native_pages et ocr_pages.""" pages = ["Hello world" * 10, "", "Texte OCR récupéré"] methods = [ ExtractionMethod.NATIVE_PDFPLUMBER, ExtractionMethod.OCR_DOCTR, ExtractionMethod.OCR_DOCTR, ] stats = _compute_extraction_stats(pages, methods, "pdfplumber") assert stats.total_pages == 3 assert stats.native_pages == 1 assert stats.ocr_pages == 2 assert 2 in stats.empty_pages # page 2 (1-indexed) est vide def test_extraction_method_enum_values(self): """Vérifie les valeurs de l'enum ExtractionMethod.""" assert ExtractionMethod.NATIVE_PDFPLUMBER.value == "native_pdfplumber" assert ExtractionMethod.NATIVE_PYMUPDF.value == "native_pymupdf" assert ExtractionMethod.OCR_DOCTR.value == "ocr_doctr" assert ExtractionMethod.DOCX.value == "docx" assert ExtractionMethod.IMAGE.value == "image_ocr" # --------------------------------------------------------------------------- # Tests OCR fallback # --------------------------------------------------------------------------- class TestOCRFallback: """Tests du mécanisme de fallback OCR.""" def test_ocr_fallback_disabled_by_default(self): """Le fallback OCR est désactivé par défaut.""" from src.extraction.pdf_extractor import OCR_FALLBACK_ENABLED # Par défaut (sans variable d'environnement), le fallback est désactivé # Note : ce test vérifie le comportement par défaut, pas une variable statique # car elle peut être modifiée par les variables d'environnement du CI assert isinstance(OCR_FALLBACK_ENABLED, bool) def test_ocr_fallback_config_values(self): """Les constantes de config sont cohérentes.""" from src.extraction.pdf_extractor import OCR_FALLBACK_MIN_CHARS, PDF_BACKEND assert isinstance(OCR_FALLBACK_MIN_CHARS, int) assert OCR_FALLBACK_MIN_CHARS > 0 assert PDF_BACKEND in ("pdfplumber", "pymupdf") # --------------------------------------------------------------------------- # Tests DOCX extracteur (avec fixture) # --------------------------------------------------------------------------- class TestDocxExtractor: """Tests de l'extracteur DOCX.""" @pytest.fixture def sample_docx(self, tmp_path): """Crée un petit DOCX de test.""" try: from docx import Document except ImportError: pytest.skip("python-docx non installé") doc = Document() doc.add_paragraph("Premier paragraphe du document médical.") doc.add_paragraph("Diagnostic principal : Pneumopathie J18.9") doc.add_paragraph("Traitement de sortie : Amoxicilline 1g x3/j") docx_path = tmp_path / "test_medical.docx" doc.save(str(docx_path)) return docx_path @pytest.fixture def docx_with_page_breaks(self, tmp_path): """Crée un DOCX avec des sauts de page.""" try: from docx import Document from docx.oxml.ns import qn from docx.oxml import OxmlElement except ImportError: pytest.skip("python-docx non installé") doc = Document() doc.add_paragraph("Page 1 : Antécédents du patient.") # Ajouter un saut de page p = doc.add_paragraph() run = p.add_run() br = OxmlElement("w:br") br.set(qn("w:type"), "page") run._element.append(br) doc.add_paragraph("Page 2 : Compte-rendu opératoire.") docx_path = tmp_path / "test_pages.docx" doc.save(str(docx_path)) return docx_path def test_extract_docx_basic(self, sample_docx): """Extraction basique d'un DOCX.""" from src.extraction.docx_extractor import extract_text_from_docx text, tracker, stats = extract_text_from_docx(sample_docx) assert "Pneumopathie" in text assert "Amoxicilline" in text assert stats.source_format == "docx" assert stats.total_pages >= 1 assert stats.total_chars > 0 assert all(m == ExtractionMethod.DOCX for m in stats.methods) def test_extract_docx_with_page_breaks(self, docx_with_page_breaks): """Extraction d'un DOCX avec sauts de page.""" from src.extraction.docx_extractor import extract_text_from_docx text, tracker, stats = extract_text_from_docx(docx_with_page_breaks) assert stats.total_pages == 2 assert "Antécédents" in text assert "Compte-rendu" in text # PageTracker fonctionne assert tracker.char_to_page(0) == 1 def test_extract_docx_file_not_found(self, tmp_path): """FileNotFoundError si le fichier n'existe pas.""" from src.extraction.docx_extractor import extract_text_from_docx with pytest.raises(FileNotFoundError): extract_text_from_docx(tmp_path / "inexistant.docx") def test_extract_docx_stats_backend(self, sample_docx): """Le backend est bien 'python-docx'.""" from src.extraction.docx_extractor import extract_text_from_docx _, _, stats = extract_text_from_docx(sample_docx) assert stats.backend == "python-docx" # --------------------------------------------------------------------------- # Tests image extracteur (mock OCR) # --------------------------------------------------------------------------- class TestImageExtractor: """Tests de l'extracteur d'images (avec OCR mocké).""" def test_extract_image_file_not_found(self, tmp_path): """FileNotFoundError si l'image n'existe pas.""" from src.extraction.image_extractor import extract_text_from_image with pytest.raises(FileNotFoundError): extract_text_from_image(tmp_path / "inexistant.png") def test_extract_image_stats_format(self, tmp_path): """Vérifie le format des stats pour une image.""" # Créer une petite image PNG from PIL import Image img = Image.new("RGB", (100, 50), color="white") img_path = tmp_path / "test.png" img.save(str(img_path)) with patch("src.extraction.image_extractor.ocr_image", return_value="Texte OCR extrait"): from src.extraction.image_extractor import extract_text_from_image text, tracker, stats = extract_text_from_image(img_path) assert text == "Texte OCR extrait" assert stats.source_format == "image" assert stats.total_pages == 1 assert stats.ocr_pages == 1 assert stats.native_pages == 0 assert stats.methods == [ExtractionMethod.IMAGE] assert stats.backend == "doctr" def test_extract_image_empty_result(self, tmp_path): """Image sans texte détectable.""" from PIL import Image img = Image.new("RGB", (100, 50), color="white") img_path = tmp_path / "blank.png" img.save(str(img_path)) with patch("src.extraction.image_extractor.ocr_image", return_value=""): from src.extraction.image_extractor import extract_text_from_image text, tracker, stats = extract_text_from_image(img_path) assert text == "" assert stats.empty_pages == [1] assert stats.total_chars == 0 # --------------------------------------------------------------------------- # Tests de non-régression # --------------------------------------------------------------------------- class TestBackwardCompat: """Tests de rétrocompatibilité.""" def test_process_pdf_alias_exists(self): """process_pdf est un alias de process_document.""" from src.main import process_document, process_pdf assert process_pdf is process_document def test_extraction_stats_existing_properties(self): """Les propriétés existantes d'ExtractionStats fonctionnent toujours.""" stats = ExtractionStats( total_pages=5, empty_pages=[2, 4], chars_per_page=[100, 0, 200, 0, 300], total_chars=600, ) assert stats.usable_pages == 3 assert stats.coverage_ratio == 0.6 assert stats.has_quality_issues() is True alert = stats.to_alert() assert alert is not None assert "2/5" in alert flags = stats.to_flags() assert flags["extraction_empty_pages"] == [2, 4] assert flags["extraction_total_pages"] == 5 assert flags["extraction_coverage"] == 0.6 def test_extraction_stats_no_issues(self): """Pas d'alerte quand tout va bien.""" stats = ExtractionStats( total_pages=3, chars_per_page=[100, 200, 300], total_chars=600, ) assert not stats.has_quality_issues() assert stats.to_alert() is None assert stats.to_flags() == {}