import os
import io
import json
import base64
from typing import Optional, Dict, Any
from PIL import Image
from dotenv import load_dotenv

# Load environment variables from the first .env.local found.
env_paths = [
    os.path.join(os.getcwd(), ".env.local"),
    os.path.join(os.getcwd(), "rpa_vision_v3/.env.local"),
    os.path.join(os.path.dirname(__file__), "../../../.env.local"),
]
for path in env_paths:
    if os.path.exists(path):
        load_dotenv(path, override=True)
        break


class VLMProvider:
    """Semantic Vision Hub: local Ollama first, cloud opt-in.

    By default only local Ollama is used (100% local, no cloud).
    To enable the cloud APIs as a fallback, set VLM_ALLOW_CLOUD=true
    in the environment.
    """

    def __init__(self):
        # Cloud is opt-in only (set VLM_ALLOW_CLOUD=true to enable).
        self.allow_cloud = os.getenv("VLM_ALLOW_CLOUD", "").lower() in ("true", "1", "yes")

        # API keys (loaded but never used unless cloud is allowed).
        self.openai_key = os.getenv("OPENAI_API_KEY") if self.allow_cloud else None
        self.gemini_key = os.getenv("GOOGLE_API_KEY") if self.allow_cloud else None
        self.anthropic_key = os.getenv("ANTHROPIC_API_KEY") if self.allow_cloud else None
        self.deepseek_key = os.getenv("DEEPSEEK_API_KEY") if self.allow_cloud else None

        # Local Ollama configuration (always tried first).
        self.ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        self.local_model = os.getenv("RPA_VLM_MODEL", os.getenv("VLM_MODEL", "gemma4:latest"))

        cloud_status = (
            f"OpenAI: {bool(self.openai_key)}, Gemini: {bool(self.gemini_key)}, "
            f"Anthropic: {bool(self.anthropic_key)}"
            if self.allow_cloud
            else "disabled (VLM_ALLOW_CLOUD not set)"
        )
        print(f"[VLM Hub] Local Ollama: {self.ollama_url} ({self.local_model}), Cloud: {cloud_status}")

    def _to_base64(self, image_input) -> str:
        """Convert any image input (PIL image, data URL, file path, raw base64, bytes) to plain base64."""
        if isinstance(image_input, Image.Image):
            buffer = io.BytesIO()
            image_input.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        elif isinstance(image_input, str):
            if image_input.startswith("data:image"):
                return image_input.split(",", 1)[1]
            elif os.path.exists(image_input):
                with open(image_input, "rb") as f:
                    return base64.b64encode(f.read()).decode("utf-8")
            return image_input  # assumed to already be raw base64
        return base64.b64encode(image_input).decode("utf-8")

    def detect_ui_element(self, screenshot, anchor_image=None, description: str = "") -> Optional[Dict[str, Any]]:
        """Locate the element: local Ollama first, cloud as opt-in fallback."""
        # 1. Local Ollama (always first, 100% local).
        res = self._call_ollama_local(screenshot, anchor_image, description)
        if res and res.get("found"):
            return res

        # 2-4. Cloud fallback (only if VLM_ALLOW_CLOUD=true).
        if self.allow_cloud:
            if self.openai_key:
                res = self._call_openai(screenshot, anchor_image, description)
                if res and res.get("found"):
                    return res
            if self.gemini_key:
                res = self._call_gemini(screenshot, anchor_image, description)
                if res and res.get("found"):
                    return res
            if self.anthropic_key:
                res = self._call_anthropic(screenshot, anchor_image, description)
                if res and res.get("found"):
                    return res

        return res  # return the last result (Ollama or cloud)

    def _call_openai(self, screenshot, anchor_image, description):
        try:
            from openai import OpenAI

            client = OpenAI(api_key=self.openai_key)
            prompt = (
                f"UI expert: locate '{description}' precisely. "
                "Return JSON: {'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}"
            )
            content = [{"type": "text", "text": prompt}]
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{self._to_base64(screenshot)}"},
            })
            if anchor_image:
                content.append({"type": "text", "text": "Reference anchor:"})
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{self._to_base64(anchor_image)}"},
                })
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                response_format={"type": "json_object"},
                temperature=0,
            )
            return json.loads(response.choices[0].message.content)
        except Exception as e:
            print(f"⚠️ [Hub] OpenAI Error: {e}")
            return None

    def _call_gemini(self, screenshot, anchor_image, description):
        try:
            from google import genai

            client = genai.Client(api_key=self.gemini_key)
            prompt = (
                f"UI expert: locate '{description}' precisely. "
                "Return JSON: {'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000), 'confidence': float}"
            )
            contents = [prompt, Image.open(io.BytesIO(base64.b64decode(self._to_base64(screenshot))))]
            if anchor_image:
                contents.append(Image.open(io.BytesIO(base64.b64decode(self._to_base64(anchor_image)))))
            response = client.models.generate_content(
                model="gemini-1.5-flash",
                contents=contents,
                config={"response_mime_type": "application/json"},
            )
            return json.loads(response.text)
        except Exception as e:
            print(f"⚠️ [Hub] Gemini Error: {e}")
            return None

    def _call_anthropic(self, screenshot, anchor_image, description):
        try:
            import anthropic

            client = anthropic.Anthropic(api_key=self.anthropic_key)
            # Claude 3.5 Sonnet supports vision but has no native strict-JSON
            # output mode, so we rely on a reinforced prompt instead.
            prompt = (
                f"Locate '{description}'. Reply ONLY with JSON: "
                "{'found': bool, 'bbox': [ymin, xmin, ymax, xmax], 'confidence': float}"
            )
            content = [
                {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": self._to_base64(screenshot)}},
                {"type": "text", "text": prompt},
            ]
            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1000,
                messages=[{"role": "user", "content": content}],
            )
            text = response.content[0].text
            # Extract the JSON object from any surrounding prose.
            return json.loads(text[text.find("{"):text.rfind("}") + 1])
        except Exception as e:
            print(f"⚠️ [Hub] Anthropic Error: {e}")
            return None

    def _call_ollama_local(self, screenshot, anchor_image, description):
        """Call local Ollama (first choice, 100% local)."""
        try:
            import requests

            print(f"[Hub] Local Ollama ({self.local_model})...")
            prompt = (
                f"Locate the element '{description}'. "
                "Return JSON: {'found': bool, 'bbox': [ymin, xmin, ymax, xmax] (0-1000)}"
            )
            images = [self._to_base64(screenshot)]
            if anchor_image:
                images.append(self._to_base64(anchor_image))
            messages = [{"role": "user", "content": prompt, "images": images}]
            payload = {
                "model": self.local_model,
                "messages": messages,
                "stream": False,
                "format": "json",
            }
            # gemma4 requires think=false (otherwise empty tokens on Ollama >= 0.20).
            if "gemma4" in self.local_model.lower():
                payload["think"] = False
            response = requests.post(f"{self.ollama_url}/api/chat", json=payload, timeout=60)
            if response.status_code == 200:
                content = response.json().get("message", {}).get("content", "{}")
                return json.loads(content)
            return None
        except Exception as e:
            print(f"[Hub] Local Ollama error: {e}")
            return {"found": False, "error": str(e)}


# Singleton instance
vlm_hub = VLMProvider()
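

# --- Usage sketch (illustrative only, not part of the hub itself) ---
# A minimal example of how calling code might drive the hub. The file name
# "screenshot.png" and the description "Submit button" are hypothetical;
# detect_ui_element accepts a PIL image, a file path, a data URL, raw base64,
# or bytes, and the prompts above ask the models for a bbox of
# [ymin, xmin, ymax, xmax] on a 0-1000 normalized scale, which we convert
# back to pixel coordinates here.
if __name__ == "__main__":
    result = vlm_hub.detect_ui_element(
        "screenshot.png",             # hypothetical screen capture
        description="Submit button",  # hypothetical target element
    )
    if result and result.get("found"):
        img = Image.open("screenshot.png")
        w, h = img.size
        ymin, xmin, ymax, xmax = result["bbox"]
        # Center of the bbox, rescaled from 0-1000 to image pixels.
        cx = int((xmin + xmax) / 2 * w / 1000)
        cy = int((ymin + ymax) / 2 * h / 1000)
        print(f"Element center at ({cx}, {cy}), confidence={result.get('confidence')}")
    else:
        print(f"Element not found: {result}")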