refactor: réorganisation référentiels, nouveaux modules extraction, nettoyage code obsolète
- Réorganisation data/referentiels/ : pdfs/, dicts/, user/ (structure unifiée)
- Fix badges "Source absente" sur page admin référentiels
- Ré-indexation COCOA 2025 (555 → 1451 chunks, couverture 94%)
- Fix VRAM OOM : embeddings forcés CPU via T2A_EMBED_CPU
- Nouveaux modules : document_router, docx_extractor, image_extractor, ocr_engine
- Module complétude (quality/completude.py + config YAML)
- Template DIM (synthèse dimensionnelle)
- Gunicorn config + systemd service t2a-viewer
- Suppression t2a_install_rag_cleanup/ (copie obsolète)
- Suppression scripts/ et scripts_t2a_v2/ (anciens benchmarks)
- Suppression 81 fichiers _doc.txt de test
- Cache Ollama : TTL configurable, corrections loader YAML
- Dashboard : améliorations templates (base, index, detail, cpam, validation)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
417
tests/test_extraction_multiformat.py
Normal file
417
tests/test_extraction_multiformat.py
Normal file
@@ -0,0 +1,417 @@
|
||||
"""Tests pour l'extraction multi-format (PDF, images, DOCX)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.extraction.document_router import (
|
||||
SUPPORTED_EXTENSIONS,
|
||||
extract_document_with_pages,
|
||||
)
|
||||
from src.extraction.page_tracker import PageTracker
|
||||
from src.extraction.pdf_extractor import (
|
||||
ExtractionMethod,
|
||||
ExtractionStats,
|
||||
_compute_extraction_stats,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests du router — dispatch par extension
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDocumentRouter:
    """Unit tests for the router's dispatch on file extension."""

    @staticmethod
    def _route_with_stub(target_file, extractor_path, canned):
        """Touch *target_file*, stub the extractor at *extractor_path*, run the router.

        The stub returns *canned*, so the caller can compare the router's
        return value against it.
        """
        target_file.touch()
        with patch(extractor_path, return_value=canned):
            return extract_document_with_pages(target_file)

    def test_router_pdf_dispatches_correctly(self, tmp_path):
        """A .pdf file yields the result of the stubbed extract_text_with_pages."""
        canned = (
            "texte pdf",
            PageTracker([(0, 10)]),
            ExtractionStats(total_pages=1, source_format="pdf"),
        )

        outcome = self._route_with_stub(
            tmp_path / "test.pdf",
            "src.extraction.pdf_extractor.extract_text_with_pages",
            canned,
        )

        assert len(outcome) == 3
        assert outcome[0] == "texte pdf"
        assert outcome[2].source_format == "pdf"

    def test_router_image_dispatches_correctly(self, tmp_path):
        """A .png file yields the result of the stubbed extract_text_from_image."""
        image_stats = ExtractionStats(
            total_pages=1,
            source_format="image",
            methods=[ExtractionMethod.IMAGE],
        )
        canned = ("texte ocr", PageTracker([(0, 10)]), image_stats)

        outcome = self._route_with_stub(
            tmp_path / "test.png",
            "src.extraction.image_extractor.extract_text_from_image",
            canned,
        )

        assert len(outcome) == 3
        assert outcome[0] == "texte ocr"
        assert outcome[2].source_format == "image"

    def test_router_docx_dispatches_correctly(self, tmp_path):
        """A .docx file yields the result of the stubbed extract_text_from_docx."""
        docx_stats = ExtractionStats(
            total_pages=1,
            source_format="docx",
            methods=[ExtractionMethod.DOCX],
        )
        canned = ("texte docx", PageTracker([(0, 10)]), docx_stats)

        outcome = self._route_with_stub(
            tmp_path / "test.docx",
            "src.extraction.docx_extractor.extract_text_from_docx",
            canned,
        )

        assert len(outcome) == 3
        assert outcome[0] == "texte docx"
        assert outcome[2].source_format == "docx"

    def test_router_unsupported_extension_raises(self, tmp_path):
        """An unsupported extension raises ValueError."""
        unknown_file = tmp_path / "test.xyz"
        unknown_file.touch()

        with pytest.raises(ValueError, match="Format non supporté"):
            extract_document_with_pages(unknown_file)

    def test_router_supported_extensions_complete(self):
        """SUPPORTED_EXTENSIONS holds exactly the planned set of formats."""
        image_suffixes = {".jpg", ".jpeg", ".png", ".tiff", ".tif"}
        expected = {".pdf", ".docx"} | image_suffixes
        assert SUPPORTED_EXTENSIONS == expected

    @pytest.mark.parametrize("ext", [".jpg", ".jpeg", ".tiff", ".tif"])
    def test_router_all_image_extensions(self, tmp_path, ext):
        """Every remaining image extension is routed to the image extractor stub."""
        canned = (
            "texte",
            PageTracker([(0, 5)]),
            ExtractionStats(total_pages=1, source_format="image"),
        )

        outcome = self._route_with_stub(
            tmp_path / f"test{ext}",
            "src.extraction.image_extractor.extract_text_from_image",
            canned,
        )

        assert outcome[2].source_format == "image"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests ExtractionStats enrichi
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractionStats:
    """Tests for the fields added to ExtractionStats."""

    def test_stats_tracks_method(self):
        """methods, backend and source_format are populated by the stats helper."""
        page_methods = [ExtractionMethod.NATIVE_PDFPLUMBER] * 2
        page_methods.append(ExtractionMethod.OCR_DOCTR)
        page_texts = ["page 1 avec du texte", "page 2 avec du texte", "page 3 ocr"]

        stats = _compute_extraction_stats(
            page_texts,
            methods=page_methods,
            backend="pdfplumber",
        )

        assert stats.methods == page_methods
        assert stats.backend == "pdfplumber"
        assert stats.source_format == "pdf"
        assert stats.native_pages == 2
        assert stats.ocr_pages == 1

    def test_stats_default_values(self):
        """A bare ExtractionStats() carries the expected defaults."""
        stats = ExtractionStats()
        expected_defaults = [
            ("methods", []),
            ("native_pages", 0),
            ("ocr_pages", 0),
            ("backend", "pdfplumber"),
            ("source_format", "pdf"),
        ]
        for field_name, default in expected_defaults:
            assert getattr(stats, field_name) == default

    def test_stats_image_format(self):
        """ExtractionStats built for an image keeps its format and method."""
        image_kwargs = {
            "total_pages": 1,
            "source_format": "image",
            "methods": [ExtractionMethod.IMAGE],
            "backend": "doctr",
        }
        stats = ExtractionStats(**image_kwargs)

        assert stats.source_format == "image"
        assert stats.methods[0] == ExtractionMethod.IMAGE

    def test_stats_docx_format(self):
        """ExtractionStats built for a DOCX keeps its format and per-page methods."""
        stats = ExtractionStats(
            total_pages=3,
            source_format="docx",
            methods=[ExtractionMethod.DOCX for _ in range(3)],
            backend="python-docx",
            native_pages=3,
        )

        assert stats.source_format == "docx"
        assert len(stats.methods) == 3
        for method in stats.methods:
            assert method == ExtractionMethod.DOCX

    def test_compute_stats_with_methods(self):
        """_compute_extraction_stats derives native_pages and ocr_pages correctly."""
        page_texts = ["Hello world" * 10, "", "Texte OCR récupéré"]
        page_methods = [ExtractionMethod.NATIVE_PDFPLUMBER]
        page_methods += [ExtractionMethod.OCR_DOCTR] * 2

        stats = _compute_extraction_stats(page_texts, page_methods, "pdfplumber")

        assert stats.total_pages == 3
        assert stats.native_pages == 1
        assert stats.ocr_pages == 2
        # Page 2 (1-indexed) is the empty one.
        assert 2 in stats.empty_pages

    def test_extraction_method_enum_values(self):
        """Each ExtractionMethod member maps to its expected string value."""
        expected_values = [
            (ExtractionMethod.NATIVE_PDFPLUMBER, "native_pdfplumber"),
            (ExtractionMethod.NATIVE_PYMUPDF, "native_pymupdf"),
            (ExtractionMethod.OCR_DOCTR, "ocr_doctr"),
            (ExtractionMethod.DOCX, "docx"),
            (ExtractionMethod.IMAGE, "image_ocr"),
        ]
        for member, raw_value in expected_values:
            assert member.value == raw_value
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests OCR fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOCRFallback:
    """Tests for the OCR fallback configuration."""

    def test_ocr_fallback_disabled_by_default(self):
        """OCR_FALLBACK_ENABLED is exposed as a bool.

        Only the type is checked, not the value: the flag can be overridden
        by environment variables in CI, so asserting the "disabled" default
        here would make the test depend on the environment.
        """
        from src.extraction.pdf_extractor import OCR_FALLBACK_ENABLED as fallback_flag

        assert isinstance(fallback_flag, bool)

    def test_ocr_fallback_config_values(self):
        """The configuration constants have consistent types and ranges."""
        from src.extraction.pdf_extractor import OCR_FALLBACK_MIN_CHARS as min_chars
        from src.extraction.pdf_extractor import PDF_BACKEND as backend_name

        known_backends = ("pdfplumber", "pymupdf")

        assert isinstance(min_chars, int)
        assert min_chars > 0
        assert backend_name in known_backends
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests DOCX extracteur (avec fixture)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDocxExtractor:
    """Tests for the DOCX extractor, run against real files built with python-docx."""

    @pytest.fixture
    def sample_docx(self, tmp_path):
        """Build a small three-paragraph DOCX and return its path.

        Skips the test when python-docx is not installed.
        """
        try:
            from docx import Document
        except ImportError:
            pytest.skip("python-docx non installé")

        doc = Document()
        doc.add_paragraph("Premier paragraphe du document médical.")
        doc.add_paragraph("Diagnostic principal : Pneumopathie J18.9")
        doc.add_paragraph("Traitement de sortie : Amoxicilline 1g x3/j")
        docx_path = tmp_path / "test_medical.docx"
        doc.save(str(docx_path))
        return docx_path

    @pytest.fixture
    def docx_with_page_breaks(self, tmp_path):
        """Build a DOCX with two paragraphs separated by a page break.

        Skips the test when python-docx is not installed.
        """
        try:
            from docx import Document
        except ImportError:
            pytest.skip("python-docx non installé")

        doc = Document()
        doc.add_paragraph("Page 1 : Antécédents du patient.")

        # Public API: appends a paragraph whose single run holds a page break,
        # instead of hand-building the w:br element through run._element.
        doc.add_page_break()

        doc.add_paragraph("Page 2 : Compte-rendu opératoire.")
        docx_path = tmp_path / "test_pages.docx"
        doc.save(str(docx_path))
        return docx_path

    def test_extract_docx_basic(self, sample_docx):
        """Basic extraction returns the paragraph text and DOCX-flavoured stats."""
        from src.extraction.docx_extractor import extract_text_from_docx

        text, tracker, stats = extract_text_from_docx(sample_docx)

        assert "Pneumopathie" in text
        assert "Amoxicilline" in text
        assert stats.source_format == "docx"
        assert stats.total_pages >= 1
        assert stats.total_chars > 0
        assert all(m == ExtractionMethod.DOCX for m in stats.methods)

    def test_extract_docx_with_page_breaks(self, docx_with_page_breaks):
        """A page break splits the document into two pages."""
        from src.extraction.docx_extractor import extract_text_from_docx

        text, tracker, stats = extract_text_from_docx(docx_with_page_breaks)

        assert stats.total_pages == 2
        assert "Antécédents" in text
        assert "Compte-rendu" in text
        # The tracker maps the first character to page 1.
        assert tracker.char_to_page(0) == 1

    def test_extract_docx_file_not_found(self, tmp_path):
        """A missing file raises FileNotFoundError."""
        from src.extraction.docx_extractor import extract_text_from_docx

        with pytest.raises(FileNotFoundError):
            extract_text_from_docx(tmp_path / "inexistant.docx")

    def test_extract_docx_stats_backend(self, sample_docx):
        """The reported backend is 'python-docx'."""
        from src.extraction.docx_extractor import extract_text_from_docx

        _, _, stats = extract_text_from_docx(sample_docx)
        assert stats.backend == "python-docx"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests image extracteur (mock OCR)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestImageExtractor:
    """Tests for the image extractor, with the OCR call mocked out."""

    @staticmethod
    def _write_blank_png(destination):
        """Save a small white RGB image at *destination* and return that path.

        Skips the calling test when Pillow is not installed, matching how the
        DOCX fixtures in this file handle a missing python-docx.
        """
        image_module = pytest.importorskip("PIL.Image")
        image_module.new("RGB", (100, 50), color="white").save(str(destination))
        return destination

    def test_extract_image_file_not_found(self, tmp_path):
        """A missing image raises FileNotFoundError."""
        from src.extraction.image_extractor import extract_text_from_image

        with pytest.raises(FileNotFoundError):
            extract_text_from_image(tmp_path / "inexistant.png")

    def test_extract_image_stats_format(self, tmp_path):
        """Stats for an image report one OCR page handled by the doctr backend."""
        img_path = self._write_blank_png(tmp_path / "test.png")

        # ocr_image is patched in the extractor's namespace so no OCR model runs.
        with patch("src.extraction.image_extractor.ocr_image", return_value="Texte OCR extrait"):
            from src.extraction.image_extractor import extract_text_from_image

            text, tracker, stats = extract_text_from_image(img_path)

        assert text == "Texte OCR extrait"
        assert stats.source_format == "image"
        assert stats.total_pages == 1
        assert stats.ocr_pages == 1
        assert stats.native_pages == 0
        assert stats.methods == [ExtractionMethod.IMAGE]
        assert stats.backend == "doctr"

    def test_extract_image_empty_result(self, tmp_path):
        """An image with no detectable text is reported as a single empty page."""
        img_path = self._write_blank_png(tmp_path / "blank.png")

        with patch("src.extraction.image_extractor.ocr_image", return_value=""):
            from src.extraction.image_extractor import extract_text_from_image

            text, tracker, stats = extract_text_from_image(img_path)

        assert text == ""
        assert stats.empty_pages == [1]
        assert stats.total_chars == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests de non-régression
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBackwardCompat:
    """Non-regression tests for the pre-existing public surface."""

    def test_process_pdf_alias_exists(self):
        """process_pdf is the very same object as process_document."""
        from src.main import process_document, process_pdf

        assert process_pdf is process_document

    def test_extraction_stats_existing_properties(self):
        """The pre-existing ExtractionStats properties and helpers still work."""
        stats = ExtractionStats(
            total_pages=5,
            empty_pages=[2, 4],
            chars_per_page=[100, 0, 200, 0, 300],
            total_chars=600,
        )
        assert stats.usable_pages == 3
        # Ratios are floats: compare with a tolerance rather than exact equality.
        assert stats.coverage_ratio == pytest.approx(0.6)
        assert stats.has_quality_issues() is True
        alert = stats.to_alert()
        assert alert is not None
        assert "2/5" in alert

        flags = stats.to_flags()
        assert flags["extraction_empty_pages"] == [2, 4]
        assert flags["extraction_total_pages"] == 5
        assert flags["extraction_coverage"] == pytest.approx(0.6)

    def test_extraction_stats_no_issues(self):
        """No alert and no flags are produced when every page has text."""
        stats = ExtractionStats(
            total_pages=3,
            chars_per_page=[100, 200, 300],
            total_chars=600,
        )
        assert not stats.has_quality_issues()
        assert stats.to_alert() is None
        assert stats.to_flags() == {}
|
||||
Reference in New Issue
Block a user