diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py
index cf9e07614..893e0d84a 100644
--- a/core/execution/observe_reason_act.py
+++ b/core/execution/observe_reason_act.py
@@ -925,6 +925,16 @@ Règles:
             logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné")
             return False
 
+        # --- Pre-action check: is this the right element? ---
+        if target_text and method_used not in ('template',) and MSS_AVAILABLE and PIL_AVAILABLE:
+            try:
+                pre_check = self._verify_pre_click(x, y, target_text, target_desc)
+                if not pre_check:
+                    print(f"⛔ [ORA/pre-check] L'élément à ({x}, {y}) ne correspond PAS à '{target_text}' — abandon du clic")
+                    return False
+            except Exception as e:
+                print(f"⚠️ [ORA/pre-check] Erreur vérification: {e}")
+
         print(f"🖱️ [ORA/click] {decision.value} à ({x}, {y}) via {method_used}")
 
         if decision.value == 'double':
@@ -1079,6 +1089,82 @@ Règles:
             pass
         return ''
 
+    def _verify_pre_click(self, x: int, y: int, target_text: str, target_desc: str = "") -> bool:
+        """Check that the element at (x, y) matches the target BEFORE clicking.
+
+        Grabs a region of at most 200x100 pixels around (x, y) and asks the
+        Ollama vision model whether it shows the target element.
+
+        Args:
+            x: Horizontal coordinate of the intended click.
+            y: Vertical coordinate of the intended click.
+            target_text: Text of the element that should be clicked.
+            target_desc: Optional description; when non-empty it replaces
+                target_text as the label in the prompt.
+
+        Returns:
+            False only when the model replies without "YES". True when it
+            confirms the element and also on any failure (HTTP status other
+            than 200, exception): the check is deliberately fail-open.
+        """
+        try:
+            import io as _io
+            import requests as _requests
+
+            crop_w, crop_h = 200, 100
+            with mss_lib.mss() as sct:
+                # monitors[0] is the bounding box of all monitors; its origin is
+                # not (0, 0) when a monitor sits left of / above the primary one.
+                # NOTE(review): assumes (x, y) are virtual-screen coordinates,
+                # like the click itself - confirm against the caller.
+                mon = sct.monitors[0]
+                min_x, min_y = mon["left"], mon["top"]
+                max_x = min_x + mon["width"]
+                max_y = min_y + mon["height"]
+
+                # Slide the window back inside the screen rather than shrinking
+                # it, so that targets near an edge still get a full-size crop.
+                left = max(min_x, min(int(x) - crop_w // 2, max_x - crop_w))
+                top = max(min_y, min(int(y) - crop_h // 2, max_y - crop_h))
+                region = {
+                    "left": left,
+                    "top": top,
+                    "width": min(crop_w, max_x - left),
+                    "height": min(crop_h, max_y - top),
+                }
+                # Grab only the region of interest, not the whole desktop.
+                grab = sct.grab(region)
+                crop = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
+
+            buffer = _io.BytesIO()
+            crop.save(buffer, format='JPEG', quality=70)
+            crop_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+            label = target_desc or target_text
+            ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+            resp = _requests.post(f"{ollama_url}/api/generate", json={
+                "model": "qwen2.5vl:3b",
+                "prompt": f"Is this UI element '{label}'? Answer only YES or NO.",
+                "images": [crop_b64],
+                "stream": False,
+                "options": {"temperature": 0.1, "num_predict": 5}
+            }, timeout=15)
+
+            if resp.status_code != 200:
+                # Fail-open, but leave a trace instead of passing silently.
+                print(f"⚠️ [ORA/pre-check] HTTP {resp.status_code} — vérification ignorée")
+                return True
+
+            answer = (resp.json().get("response") or "").strip().upper()
+            is_match = "YES" in answer
+            print(f"🔍 [ORA/pre-check] '{label}' → {answer} → {'✅' if is_match else '❌'}")
+            return is_match
+
+        except Exception as e:
+            # Any failure (capture, network, decoding) must not block the click.
+            print(f"⚠️ [ORA/pre-check] Erreur: {e}")
+            return True
+
     def _phash_distance(self, hash1: Any, hash2: Any) -> int:
         """Distance de Hamming entre deux pHash. Retourne 999 si non calculable."""
         if hash1 is None or hash2 is None: