Files
rpa_vision_v3/visual_workflow_builder/backend/vlm_provider.py
Dom d5deac3029 feat: VLM-first visual replay, separate worker, Léa package, AZERTY, HTTPS security
Visual replay pipeline:
- VLM-first: the agent calls Ollama directly to locate UI elements
- Template matching as fallback (strict 0.90 threshold)
- Immediate stop when an element is not found (no blind clicks)
- Replay from a raw session (/replay-session) without waiting for the VLM
- Post-action verification (screenshot hash before/after; see the sketch after this list)
- Popup handling (Enter/Escape/Tab+Enter)
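
A minimal sketch of that post-action check, assuming PIL's ImageGrab for capture and a caller-supplied action callback (screen_hash and verify_action are illustrative names, not the project's actual helpers):

import hashlib
import time

from PIL import ImageGrab

def screen_hash() -> str:
    """Cheap fingerprint of the current screen for before/after comparison."""
    img = ImageGrab.grab()  # full-screen capture (Windows/macOS)
    return hashlib.sha256(img.tobytes()).hexdigest()

def verify_action(action, timeout_s: float = 2.0) -> bool:
    """Run an action, then confirm the screen actually changed."""
    before = screen_hash()
    action()
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if screen_hash() != before:
            return True   # the UI reacted to the action
        time.sleep(0.1)
    return False          # no visible change: abort the replay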

Separate VLM worker (a polling sketch follows this list):
- run_worker.py: a process distinct from the HTTP server
- File-based communication (_worker_queue.txt + _replay_active.lock)
- The HTTP server never runs VLM inference anymore → always responsive
- systemd service rpa-worker.service
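
A minimal sketch of the file-based hand-off, assuming one job per line in _worker_queue.txt and a caller-supplied handle_job callback; the polling details are assumptions, not the actual run_worker.py:

import os
import time

QUEUE_FILE = "_worker_queue.txt"
LOCK_FILE = "_replay_active.lock"

def worker_loop(handle_job):
    """Poll the file queue and run one VLM job at a time."""
    while True:
        if os.path.exists(QUEUE_FILE) and not os.path.exists(LOCK_FILE):
            with open(QUEUE_FILE, "r", encoding="utf-8") as f:
                job = f.readline().strip()
            if job:
                open(LOCK_FILE, "w").close()   # signal: replay in progress
                try:
                    handle_job(job)            # slow VLM work happens here
                finally:
                    os.remove(LOCK_FILE)       # release the lock even on failure
                # drop the consumed line, keep whatever was appended meanwhile
                with open(QUEUE_FILE, "r", encoding="utf-8") as f:
                    rest = f.readlines()[1:]
                with open(QUEUE_FILE, "w", encoding="utf-8") as f:
                    f.writelines(rest)
        time.sleep(0.5)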

Keyboard capture:
- raw_keys (vk + press/release) for exact, layout-independent replay
- AZERTY fix: ToUnicodeEx + AltGr detection
- Enter captured as \n, Tab as \t
- Modifier-only events filtered out (stray Ctrl/Alt/Shift)
- Consecutive text_input events merged, key_combo deduplicated (see the sketch after this list)
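
A minimal sketch of that event post-processing, assuming a list of dicts with "type", "text" and "combo" keys (the event schema is illustrative):

MODIFIER_KEYS = {"ctrl", "alt", "shift"}

def clean_events(events):
    """Drop modifier-only events, fuse consecutive text_input, dedup repeated key_combos."""
    out = []
    for ev in events:
        if ev["type"] == "key_combo" and set(ev["combo"]) <= MODIFIER_KEYS:
            continue  # stray Ctrl/Alt/Shift press with no companion key
        if ev["type"] == "text_input" and out and out[-1]["type"] == "text_input":
            out[-1]["text"] += ev["text"]  # fuse consecutive keystrokes
            continue
        if ev["type"] == "key_combo" and out and out[-1] == ev:
            continue  # exact duplicate combo (e.g. key auto-repeat)
        out.append(ev)
    return out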

Sécurité & Internet :
- HTTPS Let's Encrypt (lea.labs + vwb.labs.laurinebazin.design)
- Token API fixe dans .env.local
- HTTP Basic Auth sur VWB
- Security headers (HSTS, CSP, nosniff)
- CORS domaines publics, plus de wildcard
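
A middleware sketch of those security headers; the FastAPI/Starlette framework choice here is an assumption, the actual backend may wire them differently:

from fastapi import FastAPI, Request

app = FastAPI()

SECURITY_HEADERS = {
    "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
    "Content-Security-Policy": "default-src 'self'",
    "X-Content-Type-Options": "nosniff",
}

@app.middleware("http")
async def add_security_headers(request: Request, call_next):
    # Attach the security headers to every response
    response = await call_next(request)
    for name, value in SECURITY_HEADERS.items():
        response.headers[name] = value
    return response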

Infrastructure:
- DPI awareness (SetProcessDpiAwareness) in Python + Rust
- System metadata (dpi_scale, window_bounds, monitors, os_theme)
- Multi-scale template matching over [0.5, 2.0] (see the sketch after this list)
- Dynamic resolution (no more hardcoded 1920x1080)
- VLM prefill fix (47x speedup, 3.5 s instead of 180 s)
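
A minimal sketch of multi-scale template matching with OpenCV over the [0.5, 2.0] range; the scale sampling step and the return shape are assumptions:

import cv2
import numpy as np

def match_template_multiscale(screen_bgr, template_bgr,
                              scales=np.linspace(0.5, 2.0, 16), threshold=0.90):
    """Search the template over a range of scales; return the best hit above threshold."""
    best = None
    for s in scales:
        t = cv2.resize(template_bgr, None, fx=s, fy=s, interpolation=cv2.INTER_AREA)
        if t.shape[0] > screen_bgr.shape[0] or t.shape[1] > screen_bgr.shape[1]:
            continue  # template scaled past the screenshot size
        result = cv2.matchTemplate(screen_bgr, t, cv2.TM_CCOEFF_NORMED)
        _, score, _, loc = cv2.minMaxLoc(result)
        if score >= threshold and (best is None or score > best[0]):
            best = (score, loc, s)
    if best is None:
        return None  # strict threshold: better to stop than to click blind
    score, (x, y), s = best
    h, w = template_bgr.shape[:2]
    return {"x": x, "y": y, "w": int(w * s), "h": int(h * s), "confidence": float(score)}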

Modules:
- core/auth/: credential vault (Fernet AES), TOTP (RFC 6238; a stdlib sketch follows this list), auth handler
- core/federation/: anonymised LearningPack export/import, global FAISS index
- deploy/: Léa package (config.txt, Lea.bat, install.bat, LISEZMOI.txt)
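
For the TOTP part, a minimal RFC 6238 sketch using only the standard library (vault integration and code verification are omitted):

import base64
import hashlib
import hmac
import struct
import time

def totp(secret_b32: str, period: int = 30, digits: int = 6) -> str:
    """RFC 6238 time-based one-time password (SHA-1, the common default)."""
    key = base64.b32decode(secret_b32.upper() + "=" * (-len(secret_b32) % 8))
    counter = struct.pack(">Q", int(time.time()) // period)  # RFC 4226 moving factor
    mac = hmac.new(key, counter, hashlib.sha1).digest()
    offset = mac[-1] & 0x0F                                  # dynamic truncation
    code = (struct.unpack(">I", mac[offset:offset + 4])[0] & 0x7FFFFFFF) % (10 ** digits)
    return str(code).zfill(digits)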

UX:
- OS filtering (VWB + Chat only show workflows for the current OS)
- Persistent library (local cache + SQLite)
- Hybrid clustering (window title + DBSCAN; see the sketch after this list)
- EdgeConstraints + PostConditions populated
- GraphBuilder compound actions (all keystrokes)
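
A minimal sketch of title-based clustering with scikit-learn's DBSCAN; the TF-IDF character n-gram features are an assumption about how the "hybrid" part is fed:

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_by_window_title(titles, eps=0.4, min_samples=2):
    """Group recorded sessions whose window titles look alike."""
    vectors = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4)).fit_transform(titles)
    labels = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit_predict(vectors)
    return labels  # -1 marks titles that belong to no cluster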

Rust agent:
- Bearer token auth (network.rs; the server-side check is sketched below)
- sysinfo.rs (DPI, resolution, window bounds via the Win32 API)
- config.txt read automatically
- Chrome/Brave/Firefox support (not just Edge)
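
On the Python side, the matching Bearer verification could look like this constant-time sketch (the RPA_API_TOKEN variable name is hypothetical):

import hmac
import os

API_TOKEN = os.getenv("RPA_API_TOKEN", "")  # hypothetical env variable name

def check_bearer(auth_header: str) -> bool:
    """Validate 'Authorization: Bearer <token>' with a constant-time compare."""
    if not auth_header.startswith("Bearer "):
        return False
    return hmac.compare_digest(auth_header[7:], API_TOKEN)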

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 10:19:18 +01:00


import os
import io
import json
import base64
from typing import Optional, Dict, Any
from PIL import Image
from dotenv import load_dotenv

# Load environment variables from the first .env.local found
env_paths = [
    os.path.join(os.getcwd(), ".env.local"),
    os.path.join(os.getcwd(), "rpa_vision_v3/.env.local"),
    os.path.join(os.path.dirname(__file__), "../../../.env.local")
]
for path in env_paths:
    if os.path.exists(path):
        load_dotenv(path, override=True)
        break
class VLMProvider:
    """Multi-provider semantic vision hub (OpenAI, Gemini, Anthropic, Ollama)."""

    def __init__(self):
        # API keys
        self.openai_key = os.getenv("OPENAI_API_KEY")
        self.gemini_key = os.getenv("GOOGLE_API_KEY")
        self.anthropic_key = os.getenv("ANTHROPIC_API_KEY")
        self.deepseek_key = os.getenv("DEEPSEEK_API_KEY")
        # Local Ollama configuration
        self.ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        self.local_model = os.getenv("VLM_MODEL", "qwen3-vl:8b")
        # Default priority
        self.preferred_cloud = "openai"  # gpt-4o is the UI-vision reference
        print(f"🔧 [VLM Hub] Initialized. OpenAI: {bool(self.openai_key)}, Gemini: {bool(self.gemini_key)}, Anthropic: {bool(self.anthropic_key)}")
    def _to_base64(self, image_input) -> str:
        """Convert any image input (PIL image, data URL, file path, raw base64, bytes) to a plain base64 string."""
        if isinstance(image_input, Image.Image):
            buffer = io.BytesIO()
            image_input.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        elif isinstance(image_input, str):
            if image_input.startswith("data:image"):
                return image_input.split(",", 1)[1]
            elif os.path.exists(image_input):
                with open(image_input, "rb") as f:
                    return base64.b64encode(f.read()).decode("utf-8")
            return image_input  # assumed to already be raw base64
        return base64.b64encode(image_input).decode("utf-8")
    def detect_ui_element(self, screenshot, anchor_image=None, description: str = "") -> Optional[Dict[str, Any]]:
        """Try to locate the element, querying providers in decreasing order of quality."""
        # 1. Try OpenAI (UI-vision reference)
        if self.openai_key:
            res = self._call_openai(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 2. Try Gemini (excellent vision backup)
        if self.gemini_key:
            res = self._call_gemini(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 3. Try Anthropic (logical precision)
        if self.anthropic_key:
            res = self._call_anthropic(screenshot, anchor_image, description)
            if res and res.get('found'):
                return res
        # 4. Local fallback (Ollama) - crucial for the DGX Spark
        return self._call_ollama_local(screenshot, anchor_image, description)
    def _call_openai(self, screenshot, anchor_image, description):
        try:
            from openai import OpenAI
            client = OpenAI(api_key=self.openai_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"
            content = [{"type": "text", "text": prompt}]
            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(screenshot)}"}})
            if anchor_image:
                content.append({"type": "text", "text": "Ancre de référence:"})
                content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{self._to_base64(anchor_image)}"}})
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                response_format={"type": "json_object"},
                temperature=0
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            print(f"⚠️ [Hub] OpenAI Error: {e}")
            return None
    def _call_gemini(self, screenshot, anchor_image, description):
        try:
            from google import genai
            client = genai.Client(api_key=self.gemini_key)
            prompt = f"Expert UI: Localise précisément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}}"
            contents = [prompt, Image.open(io.BytesIO(base64.b64decode(self._to_base64(screenshot))))]
            if anchor_image:
                contents.append(Image.open(io.BytesIO(base64.b64decode(self._to_base64(anchor_image)))))
            response = client.models.generate_content(
                model="gemini-1.5-flash",
                contents=contents,
                config={"response_mime_type": "application/json"}
            )
            return json.loads(response.text)
        except Exception as e:
            print(f"⚠️ [Hub] Gemini Error: {e}")
            return None
    def _call_anthropic(self, screenshot, anchor_image, description):
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=self.anthropic_key)
            # Claude 3.5 Sonnet supports vision but has no native strict-JSON output mode,
            # so a reinforced prompt is used instead.
            prompt = f"Localise '{description}'. Réponds UNIQUEMENT en JSON : {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax], 'confidence': float}}"
            content = [{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": self._to_base64(screenshot)}},
                       {"type": "text", "text": prompt}]
            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1000,
                messages=[{"role": "user", "content": content}]
            )
            text = response.content[0].text
            # Extract the first JSON object from the raw completion
            return json.loads(text[text.find('{'):text.rfind('}') + 1])
        except Exception as e:
            print(f"⚠️ [Hub] Anthropic Error: {e}")
            return None
    def _call_ollama_local(self, screenshot, anchor_image, description):
        """Call the local Ollama instance (DGX Spark / offline mode)."""
        try:
            import requests
            print(f"🏠 [Hub] Local Ollama fallback ({self.local_model})...")
            prompt = f"Localise l'élément '{description}'. Retourne JSON: {{'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000)}}"
            payload = {
                "model": self.local_model,
                "prompt": prompt,
                "images": [self._to_base64(screenshot)],
                "stream": False,
                "format": "json"
            }
            if anchor_image:
                payload["images"].append(self._to_base64(anchor_image))
            response = requests.post(f"{self.ollama_url}/api/generate", json=payload, timeout=60)
            if response.status_code == 200:
                return json.loads(response.json().get('response', '{}'))
            return None
        except Exception as e:
            print(f"❌ [Hub] Local Ollama Error: {e}")
            return {"found": False, "error": str(e)}

# Singleton instance shared by the backend
vlm_hub = VLMProvider()
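
Example usage of the singleton; the bbox decoding assumes the 0-1000 normalised format requested in the prompts above:

# Locate a button on the current screen and compute its centre in pixels
from PIL import ImageGrab

shot = ImageGrab.grab()
hit = vlm_hub.detect_ui_element(shot, description="bouton Enregistrer")
if hit and hit.get("found"):
    ymin, xmin, ymax, xmax = hit["bbox"]  # normalised to 0-1000
    cx = (xmin + xmax) / 2 / 1000 * shot.width
    cy = (ymin + ymax) / 2 / 1000 * shot.height
    print(f"Element at ({cx:.0f}, {cy:.0f}), confidence={hit.get('confidence')}")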