# t2a_v2/tests/test_extraction_multiformat.py

"""Tests pour l'extraction multi-format (PDF, images, DOCX)."""

from __future__ import annotations

from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from src.extraction.document_router import (
    SUPPORTED_EXTENSIONS,
    extract_document_with_pages,
)
from src.extraction.page_tracker import PageTracker
from src.extraction.pdf_extractor import (
    ExtractionMethod,
    ExtractionStats,
    _compute_extraction_stats,
)


# ---------------------------------------------------------------------------
# Tests du router — dispatch par extension
# ---------------------------------------------------------------------------


class TestDocumentRouter:
    """Unit tests for the router's extension-based dispatch."""

    @staticmethod
    def _route_with_mock(path, target, mock_return):
        """Run the router on ``path`` with the extractor at ``target`` mocked.

        Args:
            path: File handed to ``extract_document_with_pages``.
            target: Dotted name of the extractor function to patch.
            mock_return: Value the patched extractor returns.

        Returns:
            Whatever ``extract_document_with_pages`` returned for ``path``.
        """
        with patch(target, return_value=mock_return) as extractor:
            result = extract_document_with_pages(path)
        # Check the call record explicitly so the dispatch itself is verified,
        # not only the shape of the value that came back.
        extractor.assert_called_once()
        return result

    def test_router_pdf_dispatches_correctly(self, tmp_path):
        """A .pdf file is dispatched to extract_text_with_pages."""
        pdf_file = tmp_path / "test.pdf"
        pdf_file.touch()

        mock_stats = ExtractionStats(total_pages=1, source_format="pdf")
        mock_return = ("texte pdf", PageTracker([(0, 10)]), mock_stats)

        result = self._route_with_mock(
            pdf_file,
            "src.extraction.pdf_extractor.extract_text_with_pages",
            mock_return,
        )

        assert len(result) == 3
        assert result[0] == "texte pdf"
        assert result[2].source_format == "pdf"

    def test_router_image_dispatches_correctly(self, tmp_path):
        """A .png file is dispatched to extract_text_from_image."""
        png_file = tmp_path / "test.png"
        png_file.touch()

        mock_stats = ExtractionStats(
            total_pages=1, source_format="image",
            methods=[ExtractionMethod.IMAGE],
        )
        mock_return = ("texte ocr", PageTracker([(0, 10)]), mock_stats)

        result = self._route_with_mock(
            png_file,
            "src.extraction.image_extractor.extract_text_from_image",
            mock_return,
        )

        assert len(result) == 3
        assert result[0] == "texte ocr"
        assert result[2].source_format == "image"

    def test_router_docx_dispatches_correctly(self, tmp_path):
        """A .docx file is dispatched to extract_text_from_docx."""
        docx_file = tmp_path / "test.docx"
        docx_file.touch()

        mock_stats = ExtractionStats(
            total_pages=1, source_format="docx",
            methods=[ExtractionMethod.DOCX],
        )
        mock_return = ("texte docx", PageTracker([(0, 10)]), mock_stats)

        result = self._route_with_mock(
            docx_file,
            "src.extraction.docx_extractor.extract_text_from_docx",
            mock_return,
        )

        assert len(result) == 3
        assert result[0] == "texte docx"
        assert result[2].source_format == "docx"

    def test_router_unsupported_extension_raises(self, tmp_path):
        """An unsupported extension raises ValueError."""
        xyz_file = tmp_path / "test.xyz"
        xyz_file.touch()

        with pytest.raises(ValueError, match="Format non supporté"):
            extract_document_with_pages(xyz_file)

    def test_router_supported_extensions_complete(self):
        """SUPPORTED_EXTENSIONS is exactly the set of planned formats."""
        expected = {".pdf", ".jpg", ".jpeg", ".png", ".tiff", ".tif", ".docx"}
        assert SUPPORTED_EXTENSIONS == expected

    @pytest.mark.parametrize("ext", [".jpg", ".jpeg", ".tiff", ".tif"])
    def test_router_all_image_extensions(self, tmp_path, ext):
        """Every remaining image extension is routed to the image extractor.

        ``.png`` is covered by ``test_router_image_dispatches_correctly``.
        """
        img_file = tmp_path / f"test{ext}"
        img_file.touch()

        mock_stats = ExtractionStats(total_pages=1, source_format="image")
        mock_return = ("texte", PageTracker([(0, 5)]), mock_stats)

        result = self._route_with_mock(
            img_file,
            "src.extraction.image_extractor.extract_text_from_image",
            mock_return,
        )

        assert result[2].source_format == "image"


# ---------------------------------------------------------------------------
# Tests ExtractionStats enrichi
# ---------------------------------------------------------------------------


class TestExtractionStats:
    """Checks on the method/backend/format fields carried by ExtractionStats."""

    def test_stats_tracks_method(self):
        """methods, backend, source_format and the page counters are filled in."""
        page_texts = ["page 1 avec du texte", "page 2 avec du texte", "page 3 ocr"]
        per_page = [ExtractionMethod.NATIVE_PDFPLUMBER] * 2 + [ExtractionMethod.OCR_DOCTR]

        computed = _compute_extraction_stats(
            page_texts, methods=per_page, backend="pdfplumber"
        )

        assert computed.methods == per_page
        assert computed.backend == "pdfplumber"
        assert computed.source_format == "pdf"
        assert (computed.native_pages, computed.ocr_pages) == (2, 1)

    def test_stats_default_values(self):
        """A bare ExtractionStats() exposes the expected defaults."""
        defaults = ExtractionStats()

        assert defaults.methods == []
        assert (defaults.native_pages, defaults.ocr_pages) == (0, 0)
        assert defaults.backend == "pdfplumber"
        assert defaults.source_format == "pdf"

    def test_stats_image_format(self):
        """ExtractionStats built for a single image keeps format and method."""
        image_stats = ExtractionStats(
            total_pages=1,
            source_format="image",
            methods=[ExtractionMethod.IMAGE],
            backend="doctr",
        )

        assert image_stats.source_format == "image"
        assert image_stats.methods[0] == ExtractionMethod.IMAGE

    def test_stats_docx_format(self):
        """ExtractionStats built for a three-page DOCX keeps one DOCX method per page."""
        docx_stats = ExtractionStats(
            total_pages=3,
            source_format="docx",
            methods=[ExtractionMethod.DOCX] * 3,
            backend="python-docx",
            native_pages=3,
        )

        assert docx_stats.source_format == "docx"
        assert len(docx_stats.methods) == 3
        assert set(docx_stats.methods) == {ExtractionMethod.DOCX}

    def test_compute_stats_with_methods(self):
        """_compute_extraction_stats derives native_pages, ocr_pages and empty_pages."""
        per_page = [ExtractionMethod.NATIVE_PDFPLUMBER] + [ExtractionMethod.OCR_DOCTR] * 2
        page_texts = ["Hello world" * 10, "", "Texte OCR récupéré"]

        computed = _compute_extraction_stats(page_texts, per_page, "pdfplumber")

        assert computed.total_pages == 3
        assert (computed.native_pages, computed.ocr_pages) == (1, 2)
        # The second page (1-indexed) is the empty string above.
        assert 2 in computed.empty_pages

    def test_extraction_method_enum_values(self):
        """Each ExtractionMethod member carries its expected string value."""
        expected_values = {
            ExtractionMethod.NATIVE_PDFPLUMBER: "native_pdfplumber",
            ExtractionMethod.NATIVE_PYMUPDF: "native_pymupdf",
            ExtractionMethod.OCR_DOCTR: "ocr_doctr",
            ExtractionMethod.DOCX: "docx",
            ExtractionMethod.IMAGE: "image_ocr",
        }
        for member, raw_value in expected_values.items():
            assert member.value == raw_value


# ---------------------------------------------------------------------------
# Tests OCR fallback
# ---------------------------------------------------------------------------


class TestOCRFallback:
    """Sanity checks on the OCR-fallback configuration constants."""

    def test_ocr_fallback_disabled_by_default(self):
        """OCR_FALLBACK_ENABLED is exposed as a boolean.

        Only the type is asserted, not the value: the flag may be overridden
        through environment variables (for example in CI), so a fixed
        expected value would make the test environment-dependent.
        """
        from src.extraction.pdf_extractor import OCR_FALLBACK_ENABLED as enabled_flag

        assert isinstance(enabled_flag, bool)

    def test_ocr_fallback_config_values(self):
        """The minimum-chars threshold and the PDF backend hold sensible values."""
        from src.extraction.pdf_extractor import (
            OCR_FALLBACK_MIN_CHARS as min_chars,
            PDF_BACKEND as backend,
        )

        assert isinstance(min_chars, int)
        assert 0 < min_chars
        assert backend in ("pdfplumber", "pymupdf")


# ---------------------------------------------------------------------------
# Tests DOCX extracteur (avec fixture)
# ---------------------------------------------------------------------------


class TestDocxExtractor:
    """Tests for the DOCX extractor, run against files built with python-docx."""

    @pytest.fixture
    def sample_docx(self, tmp_path):
        """Write a three-paragraph DOCX under ``tmp_path`` and return its path.

        The requesting test is skipped when python-docx cannot be imported.
        """
        docx = pytest.importorskip("docx", reason="python-docx non installé")

        doc = docx.Document()
        doc.add_paragraph("Premier paragraphe du document médical.")
        doc.add_paragraph("Diagnostic principal : Pneumopathie J18.9")
        doc.add_paragraph("Traitement de sortie : Amoxicilline 1g x3/j")
        docx_path = tmp_path / "test_medical.docx"
        doc.save(str(docx_path))
        return docx_path

    @pytest.fixture
    def docx_with_page_breaks(self, tmp_path):
        """Write a DOCX with one explicit page break and return its path.

        The requesting test is skipped when python-docx cannot be imported.
        """
        docx = pytest.importorskip("docx", reason="python-docx non installé")

        doc = docx.Document()
        doc.add_paragraph("Page 1 : Antécédents du patient.")

        # Public API for "new paragraph holding only a page break"; avoids
        # reaching into the run's private XML element.
        doc.add_page_break()

        doc.add_paragraph("Page 2 : Compte-rendu opératoire.")
        docx_path = tmp_path / "test_pages.docx"
        doc.save(str(docx_path))
        return docx_path

    def test_extract_docx_basic(self, sample_docx):
        """Basic extraction returns the paragraph text and DOCX-flavoured stats."""
        from src.extraction.docx_extractor import extract_text_from_docx

        text, _, stats = extract_text_from_docx(sample_docx)

        assert "Pneumopathie" in text
        assert "Amoxicilline" in text
        assert stats.source_format == "docx"
        assert stats.total_pages >= 1
        assert stats.total_chars > 0
        assert all(m == ExtractionMethod.DOCX for m in stats.methods)

    def test_extract_docx_with_page_breaks(self, docx_with_page_breaks):
        """A document with one page break is reported as two pages."""
        from src.extraction.docx_extractor import extract_text_from_docx

        text, tracker, stats = extract_text_from_docx(docx_with_page_breaks)

        assert stats.total_pages == 2
        assert "Antécédents" in text
        assert "Compte-rendu" in text
        # The tracker maps the first character to page 1.
        assert tracker.char_to_page(0) == 1

    def test_extract_docx_file_not_found(self, tmp_path):
        """A missing file raises FileNotFoundError."""
        from src.extraction.docx_extractor import extract_text_from_docx

        with pytest.raises(FileNotFoundError):
            extract_text_from_docx(tmp_path / "inexistant.docx")

    def test_extract_docx_stats_backend(self, sample_docx):
        """The reported backend is 'python-docx'."""
        from src.extraction.docx_extractor import extract_text_from_docx

        _, _, stats = extract_text_from_docx(sample_docx)
        assert stats.backend == "python-docx"


# ---------------------------------------------------------------------------
# Tests image extracteur (mock OCR)
# ---------------------------------------------------------------------------


class TestImageExtractor:
    """Tests for the image extractor; the OCR call itself is mocked."""

    @staticmethod
    def _write_blank_png(folder, filename):
        """Save a 100x50 white RGB PNG as ``folder / filename`` and return the path."""
        from PIL import Image

        blank = Image.new("RGB", (100, 50), color="white")
        destination = folder / filename
        blank.save(str(destination))
        return destination

    def test_extract_image_file_not_found(self, tmp_path):
        """A missing image raises FileNotFoundError."""
        from src.extraction.image_extractor import extract_text_from_image

        missing = tmp_path / "inexistant.png"
        with pytest.raises(FileNotFoundError):
            extract_text_from_image(missing)

    def test_extract_image_stats_format(self, tmp_path):
        """Stats for an image describe a single OCR page handled by doctr."""
        png_path = self._write_blank_png(tmp_path, "test.png")

        with patch("src.extraction.image_extractor.ocr_image", return_value="Texte OCR extrait"):
            from src.extraction.image_extractor import extract_text_from_image

            text, _, stats = extract_text_from_image(png_path)

        assert text == "Texte OCR extrait"
        assert stats.source_format == "image"
        assert stats.total_pages == 1
        assert (stats.ocr_pages, stats.native_pages) == (1, 0)
        assert stats.methods == [ExtractionMethod.IMAGE]
        assert stats.backend == "doctr"

    def test_extract_image_empty_result(self, tmp_path):
        """When OCR returns no text, the single page is reported as empty."""
        png_path = self._write_blank_png(tmp_path, "blank.png")

        with patch("src.extraction.image_extractor.ocr_image", return_value=""):
            from src.extraction.image_extractor import extract_text_from_image

            text, _, stats = extract_text_from_image(png_path)

        assert text == ""
        assert stats.empty_pages == [1]
        assert stats.total_chars == 0


# ---------------------------------------------------------------------------
# Tests de non-régression
# ---------------------------------------------------------------------------


class TestBackwardCompat:
    """Backward-compatibility (non-regression) tests."""

    def test_process_pdf_alias_exists(self):
        """process_pdf is the very same object as process_document."""
        from src.main import process_document, process_pdf

        assert process_pdf is process_document

    def test_extraction_stats_existing_properties(self):
        """Pre-existing ExtractionStats properties and helpers still work."""
        stats = ExtractionStats(
            total_pages=5,
            empty_pages=[2, 4],
            chars_per_page=[100, 0, 200, 0, 300],
            total_chars=600,
        )
        assert stats.usable_pages == 3
        # Compare the ratio approximately: exact float equality would tie the
        # test to the precise arithmetic used to compute it.
        assert stats.coverage_ratio == pytest.approx(0.6)
        assert stats.has_quality_issues() is True
        alert = stats.to_alert()
        assert alert is not None
        assert "2/5" in alert

        flags = stats.to_flags()
        assert flags["extraction_empty_pages"] == [2, 4]
        assert flags["extraction_total_pages"] == 5
        assert flags["extraction_coverage"] == pytest.approx(0.6)

    def test_extraction_stats_no_issues(self):
        """No alert and no flags are produced when every page has text."""
        stats = ExtractionStats(
            total_pages=3,
            chars_per_page=[100, 200, 300],
            total_chars=600,
        )
        assert not stats.has_quality_issues()
        assert stats.to_alert() is None
        assert stats.to_flags() == {}