perf: template matching direct en PREMIER (~1-10ms)
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
cv2.matchTemplate cherche l'ancre directement dans le screenshot. Pas de RF-DETR, pas de CLIP, pas de 90 comparaisons. Seuil 0.75 pour éviter les faux positifs.

Ordre : template (~1-10ms) → CLIP (fallback) → OCR/UI-TARS (dernier recours).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -845,9 +845,45 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
|
|
||||||
x, y, confidence, method_used = None, None, 0, ''
|
x, y, confidence, method_used = None, None, 0, ''
|
||||||
|
|
||||||
# === MÉTHODE 1 : CLIP visuel (rapide, fiable si écran similaire) ===
|
# === MÉTHODE 1 : Template matching direct (~1-10ms) ===
|
||||||
|
try:
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import mss as mss_lib
|
||||||
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
|
with mss_lib.mss() as sct:
|
||||||
|
mon = sct.monitors[0]
|
||||||
|
grab = sct.grab(mon)
|
||||||
|
screen_img = PILImage.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
|
||||||
|
|
||||||
|
# Décoder l'ancre
|
||||||
|
anchor_data = base64.b64decode(screenshot_base64.split(',')[1] if ',' in screenshot_base64 else screenshot_base64)
|
||||||
|
anchor_img = PILImage.open(io.BytesIO(anchor_data))
|
||||||
|
|
||||||
|
screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
|
||||||
|
anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
|
||||||
|
|
||||||
|
if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
|
||||||
|
import time as _t
|
||||||
|
_t0 = _t.time()
|
||||||
|
result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
|
||||||
|
min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result_tm)
|
||||||
|
_elapsed = (_t.time() - _t0) * 1000
|
||||||
|
print(f"⚡ [Template] Score={max_val:.3f} pos={max_loc} ({_elapsed:.0f}ms)")
|
||||||
|
|
||||||
|
if max_val > 0.75:
|
||||||
|
x = max_loc[0] + anchor_cv.shape[1] // 2
|
||||||
|
y = max_loc[1] + anchor_cv.shape[0] // 2
|
||||||
|
confidence = max_val
|
||||||
|
method_used = 'template'
|
||||||
|
except Exception as tmpl_err:
|
||||||
|
print(f"⚠️ [Template] Erreur: {tmpl_err}")
|
||||||
|
|
||||||
|
# === MÉTHODE 2 : RF-DETR + CLIP (si template échoue) ===
|
||||||
|
if x is None:
|
||||||
from services.intelligent_executor import find_and_click
|
from services.intelligent_executor import find_and_click
|
||||||
print(f"🧠 [Vision] Recherche visuelle CLIP...")
|
print(f"🧠 [Vision] Fallback CLIP...")
|
||||||
result = find_and_click(
|
result = find_and_click(
|
||||||
anchor_image_base64=screenshot_base64,
|
anchor_image_base64=screenshot_base64,
|
||||||
anchor_bbox=anchor_bbox,
|
anchor_bbox=anchor_bbox,
|
||||||
@@ -861,7 +897,7 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
confidence = result['confidence']
|
confidence = result['confidence']
|
||||||
method_used = 'clip'
|
method_used = 'clip'
|
||||||
|
|
||||||
# === MÉTHODE 2 : OCR → UI-TARS (si CLIP échoue) ===
|
# === MÉTHODE 3 : OCR → UI-TARS (si tout échoue) ===
|
||||||
if x is None and _fc_target_text and _fc_target_text not in _action_types:
|
if x is None and _fc_target_text and _fc_target_text not in _action_types:
|
||||||
print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
|
print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
|
||||||
grounding_result = _shared_find_element(
|
grounding_result = _shared_find_element(
|
||||||
|
|||||||
Reference in New Issue
Block a user