"""
Unit tests for the data extraction engine.

Covers: ExtractionSchema, ExtractionField, DataStore,
FieldExtractor, IterationController, ExtractionEngine.
"""

import json
import os
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
import yaml

from core.extraction import (
    DataStore,
    ExtractionEngine,
    ExtractionField,
    ExtractionSchema,
    FieldExtractor,
    IterationController,
)


# ======================================================================
# Fixtures
# ======================================================================


@pytest.fixture
def sample_schema():
    """Minimal extraction schema shared by the tests.

    Four required fields (one of them regex-validated) plus one optional
    numeric field, with manual navigation capped at 5 records and no delay.
    """
    return ExtractionSchema(
        name="test_patient",
        description="Schema de test",
        fields=[
            ExtractionField(name="nom", description="Nom du patient", field_type="text", required=True),
            ExtractionField(name="prenom", description="Prenom", field_type="text", required=True),
            ExtractionField(
                name="date_naissance",
                description="Date de naissance",
                field_type="date",
                required=True,
                validation_regex=r"\d{2}/\d{2}/\d{4}",
            ),
            ExtractionField(name="ipp", description="IPP", field_type="text", required=True),
            ExtractionField(name="age", description="Age", field_type="number", required=False),
        ],
        navigation={"type": "manual", "max_records": 5, "delay_ms": 0},
    )


@pytest.fixture
def tmp_db(tmp_path):
    """Path (as a string) of a temporary SQLite database file."""
    return str(tmp_path / "test_store.db")


@pytest.fixture
def data_store(tmp_db):
    """DataStore backed by the temporary database."""
    return DataStore(db_path=tmp_db)


@pytest.fixture
def yaml_path(tmp_path, sample_schema):
    """Temporary YAML file containing the serialized sample schema."""
    path = str(tmp_path / "test_schema.yaml")
    sample_schema.to_yaml(path)
    return path


# ======================================================================
# ExtractionField
# ======================================================================


class TestExtractionField:
    """Tests for ExtractionField.validate_value."""

    def test_validate_required_present(self):
        f = ExtractionField(name="nom", description="Nom", field_type="text", required=True)
        assert f.validate_value("DUPONT") is True

    def test_validate_required_missing(self):
        f = ExtractionField(name="nom", description="Nom", field_type="text", required=True)
        assert f.validate_value(None) is False
        assert f.validate_value("") is False

    def test_validate_optional_missing(self):
        f = ExtractionField(name="note", description="Note", field_type="text", required=False)
        assert f.validate_value(None) is True
        assert f.validate_value("") is True

    def test_validate_number(self):
        f = ExtractionField(name="age", description="Age", field_type="number")
        assert f.validate_value("42") is True
        assert f.validate_value("3,14") is True  # FR format
        assert f.validate_value("abc") is False

    def test_validate_boolean(self):
        f = ExtractionField(name="actif", description="Actif", field_type="boolean")
        assert f.validate_value("oui") is True
        assert f.validate_value("true") is True
        assert f.validate_value("faux") is True
        assert f.validate_value("maybe") is False

    def test_validate_date(self):
        f = ExtractionField(name="date", description="Date", field_type="date")
        assert f.validate_value("15/03/1965") is True
        assert f.validate_value("2024-01-15") is True
        assert f.validate_value("invalid") is False

    def test_validate_regex(self):
        f = ExtractionField(
            name="ipp",
            description="IPP",
            field_type="text",
            validation_regex=r"\d{6}",
        )
        assert f.validate_value("123456") is True
        assert f.validate_value("12345") is False
        assert f.validate_value("abcdef") is False


# ======================================================================
# ExtractionSchema
# ======================================================================


class TestExtractionSchema:
    """Tests for ExtractionSchema (de)serialization, lookup and validation."""

    def test_from_dict(self, sample_schema):
        data = sample_schema.to_dict()
        rebuilt = ExtractionSchema.from_dict(data)
        assert rebuilt.name == sample_schema.name
        assert len(rebuilt.fields) == len(sample_schema.fields)
        assert rebuilt.fields[0].name == "nom"

    def test_yaml_roundtrip(self, tmp_path, sample_schema):
        yaml_file = str(tmp_path / "schema.yaml")
        sample_schema.to_yaml(yaml_file)
        loaded = ExtractionSchema.from_yaml(yaml_file)
        assert loaded.name == sample_schema.name
        assert len(loaded.fields) == len(sample_schema.fields)
        assert loaded.navigation == sample_schema.navigation

    def test_from_yaml_not_found(self, tmp_path):
        # A path under tmp_path is guaranteed not to exist and is portable,
        # unlike a hard-coded /tmp location.
        missing = tmp_path / "nonexistent_schema.yaml"
        with pytest.raises(FileNotFoundError):
            ExtractionSchema.from_yaml(str(missing))

    def test_required_fields(self, sample_schema):
        required = sample_schema.required_fields
        names = [f.name for f in required]
        assert "nom" in names
        assert "age" not in names

    def test_field_names(self, sample_schema):
        names = sample_schema.field_names
        assert names == ["nom", "prenom", "date_naissance", "ipp", "age"]

    def test_get_field(self, sample_schema):
        f = sample_schema.get_field("ipp")
        assert f is not None
        assert f.field_type == "text"
        assert sample_schema.get_field("inconnu") is None

    def test_validate_record_valid(self, sample_schema):
        record = {
            "nom": "DUPONT",
            "prenom": "Jean",
            "date_naissance": "15/03/1965",
            "ipp": "123456",
            "age": "58",
        }
        result = sample_schema.validate_record(record)
        assert result["valid"] is True
        assert result["errors"] == []
        assert result["completeness"] == 1.0

    def test_validate_record_missing_required(self, sample_schema):
        record = {
            "nom": "DUPONT",
            "prenom": "",
            "date_naissance": "15/03/1965",
            "ipp": "123456",
        }
        result = sample_schema.validate_record(record)
        assert result["valid"] is False
        assert len(result["errors"]) > 0

    def test_validate_record_invalid_format(self, sample_schema):
        record = {
            "nom": "DUPONT",
            "prenom": "Jean",
            "date_naissance": "invalid_date",
            "ipp": "123456",
        }
        result = sample_schema.validate_record(record)
        assert result["valid"] is False

    def test_load_example_yaml(self):
        """Load the example schema file dossier_patient.yaml."""
        example_path = (
            Path(__file__).parent.parent.parent
            / "data"
            / "extraction_schemas"
            / "dossier_patient.yaml"
        )
        # Report a skip instead of passing silently when the example file
        # is not present in this checkout.
        if not example_path.exists():
            pytest.skip(f"example schema not found: {example_path}")
        schema = ExtractionSchema.from_yaml(str(example_path))
        assert schema.name == "dossier_patient"
        assert len(schema.fields) >= 4
        assert schema.navigation["type"] == "list_detail"


# ======================================================================
# DataStore
# ======================================================================


class TestDataStore:
    """Tests for DataStore against a temporary database."""

    def test_create_extraction(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        assert eid is not None
        assert len(eid) == 36  # UUID format

    def test_get_extraction(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        ext = data_store.get_extraction(eid)
        assert ext is not None
        assert ext["schema_name"] == "test_patient"
        assert ext["status"] == "in_progress"

    def test_add_and_get_records(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        data_store.add_record(
            extraction_id=eid,
            data={"nom": "DUPONT", "prenom": "Jean"},
            confidence=0.85,
        )
        data_store.add_record(
            extraction_id=eid,
            data={"nom": "MARTIN", "prenom": "Marie"},
            confidence=0.92,
        )
        records = data_store.get_records(eid)
        assert len(records) == 2
        assert records[0]["data"]["nom"] == "DUPONT"
        assert records[1]["confidence"] == 0.92

    def test_finish_extraction(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        data_store.finish_extraction(eid, status="completed")
        ext = data_store.get_extraction(eid)
        assert ext["status"] == "completed"

    def test_list_extractions(self, data_store, sample_schema):
        data_store.create_extraction(sample_schema)
        data_store.create_extraction(sample_schema)
        extractions = data_store.list_extractions()
        assert len(extractions) == 2

    def test_export_csv(self, data_store, sample_schema, tmp_path):
        eid = data_store.create_extraction(sample_schema)
        data_store.add_record(eid, {"nom": "DUPONT", "prenom": "Jean"}, confidence=0.9)
        data_store.add_record(eid, {"nom": "MARTIN", "prenom": "Marie"}, confidence=0.8)

        csv_path = str(tmp_path / "export.csv")
        data_store.export_csv(eid, csv_path)

        content = Path(csv_path).read_text(encoding="utf-8-sig")
        assert "DUPONT" in content
        assert "MARTIN" in content
        # Check the header row
        lines = content.strip().split("\n")
        assert "nom" in lines[0]
        assert "prenom" in lines[0]

    def test_export_csv_empty(self, data_store, sample_schema, tmp_path):
        eid = data_store.create_extraction(sample_schema)
        # Target a per-test temporary path so nothing can leak into a
        # shared /tmp directory.
        with pytest.raises(ValueError, match="Aucun enregistrement"):
            data_store.export_csv(eid, str(tmp_path / "empty.csv"))

    def test_get_stats(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        data_store.add_record(eid, {"nom": "DUPONT", "prenom": "Jean", "ipp": "123"}, confidence=0.9)
        data_store.add_record(eid, {"nom": "MARTIN", "prenom": None, "ipp": "456"}, confidence=0.7)
        stats = data_store.get_stats(eid)
        assert stats["record_count"] == 2
        # An average of floats should not be compared with strict equality.
        assert stats["avg_confidence"] == pytest.approx(0.8)
        assert "field_coverage" in stats

    def test_delete_extraction(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        data_store.add_record(eid, {"nom": "TEST"}, confidence=0.5)
        assert data_store.delete_extraction(eid) is True
        assert data_store.get_extraction(eid) is None
        assert data_store.get_records(eid) == []

    def test_record_count_updated(self, data_store, sample_schema):
        eid = data_store.create_extraction(sample_schema)
        data_store.add_record(eid, {"nom": "A"}, confidence=0.5)
        data_store.add_record(eid, {"nom": "B"}, confidence=0.6)
        ext = data_store.get_extraction(eid)
        assert ext["record_count"] == 2


# ======================================================================
# FieldExtractor (mock VLM)
# ======================================================================


class TestFieldExtractor:
    """Tests for FieldExtractor with the VLM HTTP call mocked out."""

    def test_extract_file_not_found(self, sample_schema, tmp_path):
        extractor = FieldExtractor()
        # Guaranteed-missing file inside the per-test temporary directory.
        missing = tmp_path / "nonexistent.png"
        result = extractor.extract_fields(str(missing), sample_schema)
        assert result["confidence"] == 0.0
        assert len(result["errors"]) > 0

    def test_parse_vlm_response_valid_json(self):
        extractor = FieldExtractor()
        data = extractor._parse_vlm_response('{"nom": "DUPONT", "prenom": "Jean"}')
        assert data == {"nom": "DUPONT", "prenom": "Jean"}

    def test_parse_vlm_response_json_in_text(self):
        extractor = FieldExtractor()
        text = 'Voici les resultats:\n{"nom": "DUPONT", "prenom": "Jean"}\nFin.'
        data = extractor._parse_vlm_response(text)
        assert data is not None
        assert data["nom"] == "DUPONT"

    def test_parse_vlm_response_markdown_json(self):
        extractor = FieldExtractor()
        text = '```json\n{"nom": "DUPONT"}\n```'
        data = extractor._parse_vlm_response(text)
        assert data is not None
        assert data["nom"] == "DUPONT"

    def test_parse_vlm_response_invalid(self):
        extractor = FieldExtractor()
        data = extractor._parse_vlm_response("pas du json du tout")
        assert data is None

    def test_parse_vlm_response_empty(self):
        extractor = FieldExtractor()
        assert extractor._parse_vlm_response("") is None
        assert extractor._parse_vlm_response(None) is None

    def test_build_extraction_prompt(self, sample_schema):
        extractor = FieldExtractor()
        prompt = extractor._build_extraction_prompt(sample_schema.fields)
        assert "nom" in prompt
        assert "prenom" in prompt
        assert "OBLIGATOIRE" in prompt
        assert "JSON" in prompt

    @patch("core.extraction.field_extractor.requests.post")
    def test_extract_via_vlm_success(self, mock_post, sample_schema, tmp_path):
        # Create a fake screenshot (PNG signature followed by padding)
        img_path = tmp_path / "test.png"
        img_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

        # Mock the Ollama response
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "response": json.dumps({
                "nom": "DUPONT",
                "prenom": "Jean",
                "date_naissance": "15/03/1965",
                "ipp": "123456",
                "age": "58",
            })
        }
        mock_post.return_value = mock_response

        extractor = FieldExtractor()
        result = extractor.extract_fields(str(img_path), sample_schema)

        assert result["data"]["nom"] == "DUPONT"
        assert result["data"]["prenom"] == "Jean"
        assert result["confidence"] > 0.0
        assert len(result["errors"]) == 0

    @patch("core.extraction.field_extractor.requests.post")
    def test_extract_via_vlm_connection_error(self, mock_post, sample_schema, tmp_path):
        """VLM unavailable -> empty data."""
        img_path = tmp_path / "test.png"
        img_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

        import requests as req
        mock_post.side_effect = req.exceptions.ConnectionError("Connection refused")

        extractor = FieldExtractor()
        result = extractor.extract_fields(str(img_path), sample_schema)

        # Must return a result (even an empty one) without raising
        assert "data" in result
        assert result["confidence"] == 0.0

    def test_check_vlm_available_down(self):
        # 99999 is above the maximum TCP port (65535), so no server can be
        # listening at this URL.
        extractor = FieldExtractor(ollama_url="http://localhost:99999")
        assert extractor.check_vlm_available() is False


# ======================================================================
# IterationController
# ======================================================================


class TestIterationController:
    """Tests for IterationController state, progress and manual navigation."""

    def test_has_next(self, sample_schema):
        ctrl = IterationController(sample_schema)
        assert ctrl.has_next() is True

    def test_max_records(self, sample_schema):
        ctrl = IterationController(sample_schema)
        assert ctrl.max_records == 5

    def test_mark_finished(self, sample_schema):
        ctrl = IterationController(sample_schema)
        assert ctrl.has_next() is True
        ctrl.mark_finished()
        assert ctrl.has_next() is False

    def test_reset(self, sample_schema):
        ctrl = IterationController(sample_schema)
        ctrl.current_index = 3
        ctrl.mark_finished()
        ctrl.reset()
        assert ctrl.current_index == 0
        assert ctrl.has_next() is True

    def test_progress(self, sample_schema):
        ctrl = IterationController(sample_schema)
        ctrl.current_index = 2
        progress = ctrl.progress
        assert progress["current_index"] == 2
        assert progress["max_records"] == 5
        # Percentage is a computed float: compare with a tolerance.
        assert progress["progress_pct"] == pytest.approx(40.0)

    @patch("core.extraction.iteration_controller.time.sleep")
    def test_navigate_manual(self, mock_sleep, sample_schema):
        """Manual navigation = just a delay."""
        ctrl = IterationController(sample_schema)
        result = ctrl.navigate_to_next("test-session")
        assert result is True
        assert ctrl.current_index == 1


# ======================================================================
# ExtractionEngine (integration with mocks)
# ======================================================================


class TestExtractionEngine:
    """Tests for ExtractionEngine wired to a mocked FieldExtractor."""

    def test_extract_current_screen_mock(self, sample_schema, tmp_path):
        """One-off extraction with a mocked VLM."""
        # Create a fake screenshot
        img_path = tmp_path / "screen.png"
        img_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

        # Mock the FieldExtractor
        mock_extractor = MagicMock()
        mock_extractor.extract_fields.return_value = {
            "data": {"nom": "DUPONT", "prenom": "Jean", "date_naissance": "15/03/1965", "ipp": "123"},
            "confidence": 0.9,
            "errors": [],
            "raw_response": "{}",
        }

        engine = ExtractionEngine(
            schema=sample_schema,
            store=DataStore(db_path=str(tmp_path / "test.db")),
            field_extractor=mock_extractor,
        )

        result = engine.extract_current_screen(str(img_path))
        assert result["data"]["nom"] == "DUPONT"
        assert result["confidence"] == 0.9
        assert "validation" in result

    def test_extract_from_file(self, sample_schema, tmp_path):
        """extract_from_file: extraction followed by storage."""
        img_path = tmp_path / "screen.png"
        img_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)

        mock_extractor = MagicMock()
        mock_extractor.extract_fields.return_value = {
            "data": {"nom": "MARTIN", "prenom": "Marie", "date_naissance": "01/01/1980", "ipp": "456"},
            "confidence": 0.85,
            "errors": [],
            "raw_response": "{}",
        }

        store = DataStore(db_path=str(tmp_path / "test.db"))
        engine = ExtractionEngine(
            schema=sample_schema,
            store=store,
            field_extractor=mock_extractor,
        )

        result = engine.extract_from_file(str(img_path))
        assert result["data"]["nom"] == "MARTIN"
        assert "record_id" in result
        assert "extraction_id" in result

        # Check that the record was stored
        records = store.get_records(result["extraction_id"])
        assert len(records) == 1

    def test_get_progress_not_running(self, sample_schema, tmp_path):
        engine = ExtractionEngine(
            schema=sample_schema,
            store=DataStore(db_path=str(tmp_path / "test.db")),
        )
        progress = engine.get_progress()
        assert progress["is_running"] is False
        assert progress["schema_name"] == "test_patient"


# ======================================================================
# Import smoke test
# ======================================================================


class TestImports:
    """Smoke test for the public names exported by core.extraction."""

    def test_import_all(self):
        """Check that all imports work."""
        from core.extraction import (
            ExtractionEngine,
            ExtractionSchema,
            ExtractionField,
            FieldExtractor,
            DataStore,
            IterationController,
        )
        assert ExtractionEngine is not None
        assert ExtractionSchema is not None
        assert ExtractionField is not None
        assert FieldExtractor is not None
        assert DataStore is not None
        assert IterationController is not None