# rpa_vision_v3/examples/simple_vlm_detection.py

#!/usr/bin/env python3
"""
Exemple Simple de Détection VLM

Montre comment utiliser le UIDetector avec VLM pour détecter
des éléments UI dans un screenshot.
"""

import sys
from collections import Counter
from pathlib import Path

# Ajouter le répertoire parent au path
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.detection.ui_detector import UIDetector, DetectionConfig
from core.detection.ollama_client import check_ollama_available


def _print_elements(elements):
    """Print a numbered detail section for each detected element.

    Each element is read through its ``type``, ``role``, ``label``, ``bbox``,
    ``center`` and ``confidence`` attributes.
    """
    separator = "=" * 80
    print(separator)
    print("ÉLÉMENTS UI DÉTECTÉS")
    print(separator)

    for i, elem in enumerate(elements, 1):
        print(f"\n{i}. {elem.type.upper()} - {elem.role}")
        print(f"   Label:      {elem.label or '(aucun)'}")
        # bbox entries are displayed as x, y, width, height, in that order.
        print(f"   Position:   x={elem.bbox[0]}, y={elem.bbox[1]}")
        print(f"   Taille:     w={elem.bbox[2]}, h={elem.bbox[3]}")
        print(f"   Centre:     ({elem.center[0]}, {elem.center[1]})")
        print(f"   Confiance:  {elem.confidence:.2%}")

    print("\n" + separator)


def _print_statistics(elements):
    """Print per-type and per-role counts followed by the mean confidence.

    ``elements`` must be non-empty: the mean divides by ``len(elements)``.
    """
    print("\nSTATISTIQUES:")
    types_count = Counter(elem.type for elem in elements)
    roles_count = Counter(elem.role for elem in elements)

    print("\nTypes d'éléments:")
    for elem_type, count in sorted(types_count.items()):
        print(f"  - {elem_type}: {count}")

    print("\nRôles sémantiques:")
    for role, count in sorted(roles_count.items()):
        print(f"  - {role}: {count}")

    avg_confidence = sum(e.confidence for e in elements) / len(elements)
    print(f"\nConfiance moyenne: {avg_confidence:.2%}")


def main():
    """Detect UI elements in the screenshot named on the command line.

    Reads the screenshot path from ``sys.argv[1]``, runs
    ``UIDetector.detect`` on it and prints every detected element followed
    by summary statistics. All diagnostics go to stdout and the function
    returns ``None`` on every path, including the early exits below:

    * no path argument given (usage is printed),
    * the path does not exist or is not a regular file,
    * Ollama is not available,
    * the detector has no VLM client after construction,
    * no element was detected.
    """
    # Validate the command line first: these checks are cheap, so a usage
    # error is reported before Ollama is contacted or the detector is built.
    if len(sys.argv) < 2:
        print("Usage: python simple_vlm_detection.py <screenshot_path>")
        print("\nExemple:")
        print("  python simple_vlm_detection.py /path/to/screenshot.png")
        return
    screenshot_path = sys.argv[1]

    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        print(f"❌ Le fichier {screenshot_path} n'existe pas")
        return
    # exists() alone would let a directory through to the detector.
    if not screenshot.is_file():
        print(f"❌ {screenshot_path} n'est pas un fichier")
        return

    # Make sure Ollama is available before building the detector.
    if not check_ollama_available():
        print("❌ Ollama n'est pas disponible!")
        print("   Lancez Ollama avec: ollama serve")
        print("   Puis téléchargez le modèle: ollama pull qwen3-vl:8b")
        return

    print("✓ Ollama est disponible\n")

    # Build the detector with its default configuration.
    print("Initialisation du UIDetector...")
    detector = UIDetector()

    if detector.vlm_client is None:
        print("❌ Le VLM n'a pas pu être initialisé")
        return

    print(f"✓ UIDetector initialisé avec {detector.config.vlm_model}\n")

    print(f"Analyse du screenshot: {screenshot_path}")
    print("(Cela peut prendre quelques secondes...)\n")

    elements = detector.detect(screenshot_path)

    print(f"✓ Détection terminée: {len(elements)} éléments trouvés\n")

    if not elements:
        print("Aucun élément UI détecté dans ce screenshot.")
        return

    _print_elements(elements)
    _print_statistics(elements)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()