feat(ORA): vérification pré-action — VLM confirme avant chaque clic
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 12s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 15s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped

Avant de cliquer, un crop de 200x100 px (au maximum) autour de la position cible est envoyé
au VLM (qwen2.5vl:3b) avec la question : "Is this UI element 'CR_patient_demo'? YES/NO"

Si NO → abandon du clic, évite les clics erronés.
Si erreur VLM → laisse passer (pas bloquant).
Vérification ignorée pour le template matching (la confiance pixel est jugée suffisante).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-22 16:22:37 +02:00
parent 4ab2c15e5c
commit 8903f35433

View File

@@ -925,6 +925,16 @@ Règles:
logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné")
return False
# --- Vérification pré-action : est-ce le bon élément ? ---
if target_text and method_used not in ('template',) and MSS_AVAILABLE and PIL_AVAILABLE:
try:
pre_check = self._verify_pre_click(x, y, target_text, target_desc)
if not pre_check:
print(f"⛔ [ORA/pre-check] L'élément à ({x}, {y}) ne correspond PAS à '{target_text}' — abandon du clic")
return False
except Exception as e:
print(f"⚠️ [ORA/pre-check] Erreur vérification: {e}")
print(f"🖱️ [ORA/click] {decision.value} à ({x}, {y}) via {method_used}")
if decision.value == 'double':
@@ -1079,6 +1089,55 @@ Règles:
pass
return ''
def _verify_pre_click(self, x: int, y: int, target_text: str, target_desc: str = "") -> bool:
    """Ask the VLM whether the element under (x, y) is the intended target.

    A fresh screenshot is taken, a window of at most 200x100 pixels around
    the click point is cropped out of it, JPEG-encoded, and posted to the
    Ollama ``/api/generate`` endpoint with a YES/NO question.

    Args:
        x: Horizontal click position, used directly as a pixel column of
            the captured image.
        y: Vertical click position, used directly as a pixel row of the
            captured image.
        target_text: Text of the element expected at that position.
        target_desc: Optional description; when non-empty it replaces
            ``target_text`` as the label quoted in the prompt.

    Returns:
        ``True`` when the model's verdict is YES, and also whenever the
        check itself cannot be completed (capture failure, network error,
        non-200 response): the check is deliberately fail-open.
        ``False`` when the model answered with anything other than YES.
    """
    try:
        import io as _io
        import re as _re
        import requests as _requests

        with mss_lib.mss() as sct:
            # NOTE(review): per the mss docs, monitors[0] is the combined
            # virtual screen. (x, y) is used as-is as image coordinates —
            # confirm callers produce coordinates relative to this capture.
            grab = sct.grab(sct.monitors[0])
            screen = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')

        # Window centred on the click point, clamped to the screenshot, so
        # it may be smaller than 200x100 near the screen edges.
        crop_w, crop_h = 200, 100
        left = max(0, x - crop_w // 2)
        top = max(0, y - crop_h // 2)
        right = min(screen.width, left + crop_w)
        bottom = min(screen.height, top + crop_h)
        crop = screen.crop((left, top, right, bottom))

        buffer = _io.BytesIO()
        crop.save(buffer, format='JPEG', quality=70)
        crop_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        label = target_desc or target_text
        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
        resp = _requests.post(f"{ollama_url}/api/generate", json={
            "model": "qwen2.5vl:3b",
            "prompt": f"Is this UI element '{label}'? Answer only YES or NO.",
            "images": [crop_b64],
            "stream": False,
            "options": {"temperature": 0.1, "num_predict": 5}
        }, timeout=15)

        if resp.status_code != 200:
            # HTTP failure is treated like any other error: do not block the click.
            print(f"⚠️ [ORA/pre-check] HTTP {resp.status_code} — vérification ignorée")
            return True

        answer = resp.json().get("response", "").strip().upper()
        # Take the first standalone YES/NO word as the verdict. A plain
        # substring test would accept "EYES" or "NO, YES..." as a match.
        words = _re.findall(r"[A-Z]+", answer)
        verdict = next((w for w in words if w in ("YES", "NO")), "")
        is_match = verdict == "YES"
        print(f"🔍 [ORA/pre-check] '{label}' → {answer} ({'OK' if is_match else 'KO'})")
        return is_match
    except Exception as e:
        # Best-effort check: any failure lets the click proceed.
        print(f"⚠️ [ORA/pre-check] Erreur: {e}")
        return True
def _phash_distance(self, hash1: Any, hash2: Any) -> int:
"""Distance de Hamming entre deux pHash. Retourne 999 si non calculable."""
if hash1 is None or hash2 is None: