#!/usr/bin/env python3
"""
Debug: inspect what the VLM actually returns for different prompt styles.
"""
import sys
from pathlib import Path

# Make the project root importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.detection.ollama_client import OllamaClient

# Default UI screenshot fed to every prompt below.
DEFAULT_SCREENSHOT = "rpa_vision_v3/examples/test_ui_screenshot.png"


def _run_prompt(client, title, prompt, screenshot_path, temperature, leading_newline):
    """Send one prompt + image to the VLM and print the banner and outcome.

    Args:
        client: OllamaClient instance used for generation.
        title: Section header printed between '=' banners.
        prompt: Text prompt sent alongside the image.
        screenshot_path: Path of the image to analyze.
        temperature: Sampling temperature passed to the client.
        leading_newline: Prefix the banner with a blank line (matches the
            original output, which separated sections 2 and 3 this way).
    """
    banner = "=" * 80
    print("\n" + banner if leading_newline else banner)
    print(title)
    print(banner)

    result = client.generate(prompt, image_path=screenshot_path, temperature=temperature)
    if result["success"]:
        print(f"✓ Réponse reçue ({len(result['response'])} caractères)")
        print(f"\nRéponse:\n{result['response']}\n")
    else:
        print(f"❌ Erreur: {result['error']}")


def test_vlm_response(screenshot_path=DEFAULT_SCREENSHOT):
    """Probe the VLM with three prompt styles: free text, button list, JSON.

    Args:
        screenshot_path: Image to analyze; defaults to the bundled test
            screenshot so existing callers are unaffected.
    """
    client = OllamaClient(model="qwen3-vl:8b")

    _run_prompt(
        client,
        "TEST 1: Prompt simple",
        "Describe what you see in this image.",
        screenshot_path,
        temperature=0.1,
        leading_newline=False,
    )

    _run_prompt(
        client,
        "TEST 2: Demander de lister les boutons",
        "List all the buttons you can see in this image. For each button, tell me its label.",
        screenshot_path,
        temperature=0.1,
        leading_newline=True,
    )

    # NOTE(review): the source was collapsed onto one line, so the exact line
    # breaks inside this triple-quoted prompt are reconstructed — confirm
    # against the original file if prompt formatting matters to the model.
    _run_prompt(
        client,
        "TEST 3: Demander JSON simple",
        """List the buttons in this image as JSON.
Format: [{"label": "button text"}]
Return only the JSON array.""",
        screenshot_path,
        temperature=0.0,
        leading_newline=True,
    )


if __name__ == "__main__":
    test_vlm_response()