From 8903f354338f1dad551ba5579750f9b9ac84319e Mon Sep 17 00:00:00 2001 From: Dom Date: Wed, 22 Apr 2026 16:22:37 +0200 Subject: [PATCH] =?UTF-8?q?feat(ORA):=20v=C3=A9rification=20pr=C3=A9-actio?= =?UTF-8?q?n=20=E2=80=94=20VLM=20confirme=20avant=20chaque=20clic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avant de cliquer, crop 200x100 autour de la position cible envoyé au VLM (qwen2.5vl:3b) : "Is this UI element 'CR_patient_demo'? YES/NO" Si NO → abandon du clic, évite les clics erronés. Si erreur VLM → laisse passer (pas bloquant). Skippé pour le template matching (confiance pixel suffisante). Co-Authored-By: Claude Opus 4.6 (1M context) --- core/execution/observe_reason_act.py | 59 ++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py index cf9e07614..893e0d84a 100644 --- a/core/execution/observe_reason_act.py +++ b/core/execution/observe_reason_act.py @@ -925,6 +925,16 @@ Règles: logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné") return False + # --- Vérification pré-action : est-ce le bon élément ? --- + if target_text and method_used not in ('template',) and MSS_AVAILABLE and PIL_AVAILABLE: + try: + pre_check = self._verify_pre_click(x, y, target_text, target_desc) + if not pre_check: + print(f"⛔ [ORA/pre-check] L'élément à ({x}, {y}) ne correspond PAS à '{target_text}' — abandon du clic") + return False + except Exception as e: + print(f"⚠️ [ORA/pre-check] Erreur vérification: {e}") + print(f"🖱️ [ORA/click] {decision.value} à ({x}, {y}) via {method_used}") if decision.value == 'double': @@ -1079,6 +1089,55 @@ Règles: pass return '' + def _verify_pre_click(self, x: int, y: int, target_text: str, target_desc: str = "") -> bool: + """Vérifie que l'élément à la position (x,y) correspond au target AVANT de cliquer. + + Fait un crop 200x100 autour de (x,y), envoie au VLM avec la question + "est-ce que c'est bien {target} ?" + """ + try: + import requests as _requests + + with mss_lib.mss() as sct: + mon = sct.monitors[0] + grab = sct.grab(mon) + screen = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX') + + # Crop 200x100 autour du point de clic + crop_w, crop_h = 200, 100 + left = max(0, x - crop_w // 2) + top = max(0, y - crop_h // 2) + right = min(screen.width, left + crop_w) + bottom = min(screen.height, top + crop_h) + crop = screen.crop((left, top, right, bottom)) + + import io as _io + buffer = _io.BytesIO() + crop.save(buffer, format='JPEG', quality=70) + crop_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') + + label = target_desc or target_text + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + resp = _requests.post(f"{ollama_url}/api/generate", json={ + "model": "qwen2.5vl:3b", + "prompt": f"Is this UI element '{label}'? Answer only YES or NO.", + "images": [crop_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 5} + }, timeout=15) + + if resp.status_code == 200: + answer = resp.json().get("response", "").strip().upper() + is_match = "YES" in answer + print(f"🔍 [ORA/pre-check] '{label}' → {answer} → {'✅' if is_match else '❌'}") + return is_match + + return True # En cas d'erreur HTTP, on laisse passer + + except Exception as e: + print(f"⚠️ [ORA/pre-check] Erreur: {e}") + return True # En cas d'erreur, on laisse passer + def _phash_distance(self, hash1: Any, hash2: Any) -> int: """Distance de Hamming entre deux pHash. Retourne 999 si non calculable.""" if hash1 is None or hash2 is None: