feat: chaîne de grounding 3 niveaux + refonte capture écran
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Grounding en cascade quand CLIP/template échouent :
1. OCR (docTR) → cherche le texte exact sur l'écran (~1s)
2. UI-TARS grounding → "click on X" → coordonnées (~3s, 94% ScreenSpot)
3. VLM reasoning → raisonnement complet + confirmation OCR (~10s)

find_element_on_screen() dans input_handler.py (partagé VWB + Léa).
Câblé dans find_and_click() et execute_action() comme fallback.

Refonte capture écran :
- mss.monitors[0] (composite) pour capturer la VM en plein écran
- FullscreenSelector réécrit : overlay via getBoundingClientRect()
- Bboxes et sélection alignées avec l'image (calcul JS, pas CSS)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -327,6 +327,307 @@ Réponds UNIQUEMENT le JSON, pas d'explication."""
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_element_on_screen(
    target_text: str,
    target_description: str = "",
    anchor_image_base64: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """
    Locate a UI element on the current screen with a three-level cascade.

    The levels are tried in order and the first truthy result wins:

    1. OCR (``_grounding_ocr``) -- text lookup, fastest (~1s).
    2. UI-TARS grounding (``_grounding_ui_tars``) -- GUI-specialised model (~3s).
    3. VLM reasoning (``_grounding_vlm``) -- full reasoning confirmed by OCR (~10s).

    Args:
        target_text: Text of the element to find (e.g. "Demo", "Enregistrer").
        target_description: Longer description (e.g. "the Demo folder on the
            desktop"). Used as the log label when provided.
        anchor_image_base64: Reference image of the anchor. Accepted for
            interface compatibility but not read by this function.

    Returns:
        ``{'x': int, 'y': int, 'method': str, 'confidence': float}`` from the
        first level that succeeds, or ``None`` when neither text nor
        description is given or when every level fails.
    """
    if not (target_text or target_description):
        logger.debug("find_element_on_screen: ni target_text ni target_description fournis")
        return None

    label = target_description or target_text
    logger.info(f"[Grounding] Recherche élément: '{label}' (cascade 3 niveaux)")

    # Ordered from cheapest to most expensive; each entry is evaluated lazily
    # so a later level only runs when all earlier ones returned nothing.
    cascade = (
        lambda: _grounding_ocr(target_text),
        lambda: _grounding_ui_tars(target_text, target_description),
        lambda: _grounding_vlm(target_text, target_description),
    )
    for attempt in cascade:
        hit = attempt()
        if hit:
            return hit

    logger.warning(f"[Grounding] ÉCHEC total pour '{label}' — aucune méthode n'a trouvé l'élément")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _capture_screen():
    """Grab monitor 1 with ``mss`` and return ``(PIL.Image, width, height)``.

    ``mss`` and ``PIL`` are imported lazily so that a missing dependency is
    reported as a failed capture rather than an import error at module load.

    Returns:
        ``(image, width, height)`` where ``image`` is an RGB ``PIL.Image`` and
        the dimensions come from the monitor description, or ``(None, 0, 0)``
        when anything goes wrong (the error is logged at debug level).

    NOTE(review): the image origin is the top-left of monitor 1. Callers that
    use OCR/grounding coordinates as absolute screen positions presumably rely
    on that monitor sitting at (0, 0) -- confirm for multi-monitor setups.
    """
    try:
        import mss
        from PIL import Image as PILImage

        with mss.mss() as grabber:
            # Index 1 is the first individual monitor in mss's monitor list.
            primary = grabber.monitors[1]
            shot = grabber.grab(primary)
            # mss exposes raw BGRA bytes; decode them as BGRX to get RGB.
            image = PILImage.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')
            width, height = primary['width'], primary['height']
        return image, width, height
    except Exception as err:
        logger.debug(f"Capture écran échouée: {err}")
        return None, 0, 0
|
||||||
|
|
||||||
|
|
||||||
|
def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
    """Level 1 -- find ``target_text`` on screen through OCR (~1s).

    A screenshot is taken, its words are extracted, and three matching passes
    are run in decreasing order of confidence. The first word that matches
    gives the click point (centre of its bounding box):

    1. case-insensitive exact match                      -> confidence 0.95
    2. substring match in either direction, both strings
       at least 3 characters long                        -> confidence 0.80
    3. target without its first letter found inside a
       word (underlined or clipped initial), target
       longer than 3 characters                          -> confidence 0.70

    Args:
        target_text: Text to look for; an empty value short-circuits to ``None``.

    Returns:
        ``{'x', 'y', 'method': 'ocr', 'confidence'}`` or ``None`` when nothing
        matches, the capture fails, or any error occurs (logged at debug level).
    """
    if not target_text:
        return None

    def _hit(entry, confidence, kind):
        # Each OCR word carries 'text' and 'bbox' == (x1, y1, x2, y2).
        left, top, right, bottom = entry['bbox']
        cx = int((left + right) / 2)
        cy = int((top + bottom) / 2)
        logger.info(f"[Grounding/OCR] Trouvé '{entry['text']}' à ({cx}, {cy}) — {kind}")
        return {'x': cx, 'y': cy, 'method': 'ocr', 'confidence': confidence}

    try:
        image, _width, _height = _capture_screen()
        if image is None:
            return None

        # Two possible locations for the OCR entry point depending on how the
        # project is laid out; fall back to FieldExtractor when the service
        # module is not importable.
        try:
            from services.ocr_service import ocr_extract_words as extract_words
        except ImportError:
            from core.extraction.field_extractor import FieldExtractor
            extract_words = FieldExtractor().extract_words_from_image

        words = extract_words(image)
        if not words:
            logger.debug("[Grounding/OCR] Aucun mot détecté")
            return None

        wanted = target_text.lower()

        # Pass 1: exact, case-insensitive.
        for entry in words:
            if entry['text'].lower() == wanted:
                return _hit(entry, 0.95, "match exact")

        # Pass 2: truncated word ("nregistrer" for "Enregistrer") -- the OCR
        # word contains the target or the other way round. Very short strings
        # are excluded on both sides to avoid accidental hits.
        if len(wanted) >= 3:
            for entry in words:
                seen = entry['text'].lower()
                if len(seen) >= 3 and (wanted in seen or seen in wanted):
                    return _hit(entry, 0.80, "match partiel")

        # Pass 3: initial letter missing (underlined accelerator or clipped).
        if len(wanted) > 3:
            tail = wanted[1:]
            for entry in words:
                if tail in entry['text'].lower():
                    return _hit(entry, 0.70, "match partiel (lettre initiale manquante)")

        logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
        return None

    except Exception as err:
        logger.debug(f"[Grounding/OCR] Erreur: {err}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _grounding_ui_tars(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
|
||||||
|
"""Niveau 2 — UI-TARS grounding visuel (~3s)."""
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
screen, screen_w, screen_h = _capture_screen()
|
||||||
|
if screen is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Encoder le screenshot en base64
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
screen.save(buffer, format='JPEG', quality=70)
|
||||||
|
image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
|
# Construire le prompt pour UI-TARS
|
||||||
|
click_target = target_description or target_text
|
||||||
|
prompt = f"click on {click_target}"
|
||||||
|
|
||||||
|
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||||
|
model = "0000/ui-tars-1.5-7b-q8_0:7b"
|
||||||
|
|
||||||
|
logger.info(f"[Grounding/UI-TARS] Envoi à {model}: '{prompt}'")
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{ollama_url}/api/generate",
|
||||||
|
json={
|
||||||
|
"model": model,
|
||||||
|
"prompt": prompt,
|
||||||
|
"images": [image_b64],
|
||||||
|
"stream": False,
|
||||||
|
"options": {"temperature": 0.1, "num_predict": 50}
|
||||||
|
},
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
logger.warning(f"[Grounding/UI-TARS] HTTP {response.status_code}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
result = response.json()
|
||||||
|
text = result.get('response', '').strip()
|
||||||
|
logger.debug(f"[Grounding/UI-TARS] Réponse brute: {text[:200]}")
|
||||||
|
|
||||||
|
# Parser les coordonnées de UI-TARS
|
||||||
|
coords = _parse_ui_tars_coordinates(text, screen_w, screen_h)
|
||||||
|
if coords:
|
||||||
|
x, y = coords
|
||||||
|
# Valider que les coordonnées sont dans l'écran
|
||||||
|
if 0 <= x <= screen_w and 0 <= y <= screen_h:
|
||||||
|
logger.info(f"[Grounding/UI-TARS] Grounding → ({x}, {y})")
|
||||||
|
return {'x': x, 'y': y, 'method': 'ui_tars', 'confidence': 0.85}
|
||||||
|
else:
|
||||||
|
logger.warning(f"[Grounding/UI-TARS] Coordonnées hors écran: ({x}, {y}) pour {screen_w}x{screen_h}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
logger.debug(f"[Grounding/UI-TARS] Pas de coordonnées parsées dans: {text[:100]}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"[Grounding/UI-TARS] Erreur: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_ui_tars_coordinates(text: str, screen_w: int, screen_h: int) -> Optional[tuple]:
|
||||||
|
"""Parse les coordonnées retournées par UI-TARS.
|
||||||
|
|
||||||
|
UI-TARS peut retourner :
|
||||||
|
- Coordonnées normalisées (0-1000) : "click at (500, 300)"
|
||||||
|
- Coordonnées en pixels : "click at (960, 540)"
|
||||||
|
- Format (x, y) ou [x, y] ou x,y
|
||||||
|
- Format "Action: click\nCoordinate: (500, 300)" ou "[500, 300]"
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(x_pixel, y_pixel) ou None
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Chercher des patterns de coordonnées
|
||||||
|
patterns = [
|
||||||
|
r'Coordinate:\s*\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?',
|
||||||
|
r'click\s+(?:at\s+)?\[?\(?\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)?\]?',
|
||||||
|
r'\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',
|
||||||
|
r'\[\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\]',
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
raw_x = float(match.group(1))
|
||||||
|
raw_y = float(match.group(2))
|
||||||
|
|
||||||
|
# UI-TARS utilise souvent des coordonnées normalisées 0-1000
|
||||||
|
if raw_x <= 1000 and raw_y <= 1000 and (raw_x > 1 or raw_y > 1):
|
||||||
|
# Probablement normalisées sur 1000
|
||||||
|
x = int(raw_x * screen_w / 1000)
|
||||||
|
y = int(raw_y * screen_h / 1000)
|
||||||
|
elif raw_x <= 1.0 and raw_y <= 1.0:
|
||||||
|
# Normalisées 0-1
|
||||||
|
x = int(raw_x * screen_w)
|
||||||
|
y = int(raw_y * screen_h)
|
||||||
|
else:
|
||||||
|
# Pixels directs
|
||||||
|
x = int(raw_x)
|
||||||
|
y = int(raw_y)
|
||||||
|
|
||||||
|
return (x, y)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _grounding_vlm(target_text: str, target_description: str = "") -> Optional[Dict[str, Any]]:
    """Level 3 -- VLM reasoning followed by an OCR confirmation (~10s).

    ``vlm_reason_about_screen`` is asked what to do in order to click the
    element. Its answer is used only when it is a ``'click'`` action with a
    non-empty ``'target'``. That target text is then searched among the OCR
    words of a fresh screenshot, and the centre of the matching word's
    bounding box becomes the click point.

    Matching follows the same rules as the OCR level: a case-insensitive exact
    match is preferred; otherwise a substring match in either direction is
    accepted only when both strings have at least 3 characters. Without that
    guard an empty or one-letter OCR token would "confirm" almost any target.

    Args:
        target_text: Short text of the element to click.
        target_description: Longer description; preferred over ``target_text``
            when building the VLM objective.

    Returns:
        ``{'x', 'y', 'method': 'vlm', 'confidence': 0.75}`` or ``None`` when
        the VLM gives no usable click target, the capture fails, OCR cannot
        confirm the target, or any error occurs (logged at debug level).
    """
    try:
        search_label = target_description or target_text

        vlm_result = vlm_reason_about_screen(
            objective=f"Cliquer sur {search_label}",
            context=f"Je cherche l'élément '{target_text}' sur l'écran pour cliquer dessus"
        )

        if not vlm_result:
            logger.debug("[Grounding/VLM] VLM n'a pas retourné de résultat")
            return None

        if vlm_result.get('action') != 'click' or not vlm_result.get('target'):
            logger.debug(f"[Grounding/VLM] VLM action={vlm_result.get('action')}, pas un clic")
            return None

        vlm_target = vlm_result['target']
        logger.info(f"[Grounding/VLM] VLM suggère de cliquer sur: '{vlm_target}'")

        # OCR confirmation: look for the VLM target on the current screen.
        screen, _screen_w, _screen_h = _capture_screen()
        if screen is None:
            return None

        try:
            try:
                from services.ocr_service import ocr_extract_words
            except ImportError:
                from core.extraction.field_extractor import FieldExtractor
                extractor = FieldExtractor()

                def ocr_extract_words(img):
                    return extractor.extract_words_from_image(img)

            # A None result is treated like "no words" instead of raising.
            words = ocr_extract_words(screen) or []

            vlm_target_lower = str(vlm_target).strip().lower()

            # Prefer an exact match so a short but correct label ("OK") can
            # still be confirmed.
            confirmed = None
            for word in words:
                if word['text'].lower() == vlm_target_lower:
                    confirmed = word
                    break

            # Otherwise accept containment, but never through very short
            # strings: '' or 'a' is a substring of nearly everything.
            if confirmed is None and len(vlm_target_lower) >= 3:
                for word in words:
                    word_lower = word['text'].lower()
                    if len(word_lower) < 3:
                        continue
                    if vlm_target_lower in word_lower or word_lower in vlm_target_lower:
                        confirmed = word
                        break

            if confirmed is not None:
                x1, y1, x2, y2 = confirmed['bbox']
                x = int((x1 + x2) / 2)
                y = int((y1 + y2) / 2)
                logger.info(f"[Grounding/VLM] Confirmé par OCR: '{confirmed['text']}' à ({x}, {y})")
                return {'x': x, 'y': y, 'method': 'vlm', 'confidence': 0.75}

            logger.debug(f"[Grounding/VLM] Target VLM '{vlm_target}' non trouvé par OCR")
            return None

        except Exception as e:
            logger.debug(f"[Grounding/VLM] OCR de confirmation échoué: {e}")
            return None

    except Exception as e:
        logger.debug(f"[Grounding/VLM] Erreur: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
def post_execution_cleanup(execution_mode: str = 'debug'):
|
def post_execution_cleanup(execution_mode: str = 'debug'):
|
||||||
"""Vérifie l'écran après exécution et gère les dialogues restants.
|
"""Vérifie l'écran après exécution et gère les dialogues restants.
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from core.execution.input_handler import (
|
|||||||
check_screen_for_patterns as _shared_check_patterns,
|
check_screen_for_patterns as _shared_check_patterns,
|
||||||
handle_detected_pattern as _shared_handle_pattern,
|
handle_detected_pattern as _shared_handle_pattern,
|
||||||
post_execution_cleanup as _shared_post_cleanup,
|
post_execution_cleanup as _shared_post_cleanup,
|
||||||
|
find_element_on_screen as _shared_find_element,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -213,6 +214,9 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
|
|||||||
break
|
break
|
||||||
|
|
||||||
# === EXÉCUTION DE L'ACTION ===
|
# === EXÉCUTION DE L'ACTION ===
|
||||||
|
# Passer le label de l'étape pour le grounding textuel
|
||||||
|
if step.label:
|
||||||
|
params['_step_label'] = step.label
|
||||||
result = execute_action(step.action_type, params)
|
result = execute_action(step.action_type, params)
|
||||||
|
|
||||||
# === SELF-HEALING INTERACTIF ===
|
# === SELF-HEALING INTERACTIF ===
|
||||||
@@ -809,12 +813,20 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
'height': bbox.get('height', 0)
|
'height': bbox.get('height', 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Extraire le texte cible pour le grounding en dernier recours
|
||||||
|
_fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
|
||||||
|
if not _fc_target_text:
|
||||||
|
_fc_target_text = params.get('_step_label', '')
|
||||||
|
_fc_target_desc = params.get('visual_anchor', {}).get('description', '')
|
||||||
|
|
||||||
# Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
|
# Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
|
||||||
result = find_and_click(
|
result = find_and_click(
|
||||||
anchor_image_base64=screenshot_base64,
|
anchor_image_base64=screenshot_base64,
|
||||||
anchor_bbox=anchor_bbox,
|
anchor_bbox=anchor_bbox,
|
||||||
method='clip', # UI-DETR-1 + CLIP avec pondération par distance
|
method='clip', # UI-DETR-1 + CLIP avec pondération par distance
|
||||||
detection_threshold=0.35
|
detection_threshold=0.35,
|
||||||
|
target_text=_fc_target_text,
|
||||||
|
target_description=_fc_target_desc
|
||||||
)
|
)
|
||||||
|
|
||||||
if result['found'] and result['coordinates']:
|
if result['found'] and result['coordinates']:
|
||||||
@@ -853,6 +865,47 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})")
|
print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})")
|
||||||
print(f" Raison: {reason}")
|
print(f" Raison: {reason}")
|
||||||
|
|
||||||
|
# === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
|
||||||
|
target_text = params.get('visual_anchor', {}).get('target_text', '')
|
||||||
|
if not target_text:
|
||||||
|
target_text = params.get('_step_label', '')
|
||||||
|
target_desc = params.get('visual_anchor', {}).get('description', '')
|
||||||
|
|
||||||
|
if target_text:
|
||||||
|
print(f"🔗 [Grounding] Tentative cascade pour '{target_text}'...")
|
||||||
|
grounding_result = _shared_find_element(
|
||||||
|
target_text=target_text,
|
||||||
|
target_description=target_desc,
|
||||||
|
anchor_image_base64=screenshot_base64
|
||||||
|
)
|
||||||
|
if grounding_result:
|
||||||
|
gx, gy = grounding_result['x'], grounding_result['y']
|
||||||
|
gmethod = grounding_result['method']
|
||||||
|
gconf = grounding_result['confidence']
|
||||||
|
print(f"✅ [Grounding] Trouvé via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
|
||||||
|
|
||||||
|
# Effectuer le clic
|
||||||
|
if click_type == 'double':
|
||||||
|
pyautogui.doubleClick(gx, gy)
|
||||||
|
elif click_type == 'right':
|
||||||
|
pyautogui.rightClick(gx, gy)
|
||||||
|
else:
|
||||||
|
pyautogui.click(gx, gy)
|
||||||
|
|
||||||
|
time.sleep(2.0)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'success': True,
|
||||||
|
'output': {
|
||||||
|
'clicked_at': {'x': gx, 'y': gy},
|
||||||
|
'mode': execution_mode,
|
||||||
|
'confidence': gconf,
|
||||||
|
'method': f'grounding_{gmethod}'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
print(f"❌ [Grounding] Cascade échouée pour '{target_text}'")
|
||||||
|
|
||||||
# Si self-healing interactif activé, proposer des alternatives
|
# Si self-healing interactif activé, proposer des alternatives
|
||||||
if _execution_state.get('execution_mode') == 'intelligent' and candidates:
|
if _execution_state.get('execution_mode') == 'intelligent' and candidates:
|
||||||
print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur")
|
print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur")
|
||||||
|
|||||||
@@ -656,7 +656,9 @@ def find_and_click(
|
|||||||
anchor_image_base64: str,
|
anchor_image_base64: str,
|
||||||
anchor_bbox: Optional[Dict[str, int]] = None,
|
anchor_bbox: Optional[Dict[str, int]] = None,
|
||||||
method: str = 'clip',
|
method: str = 'clip',
|
||||||
detection_threshold: float = 0.35
|
detection_threshold: float = 0.35,
|
||||||
|
target_text: str = '',
|
||||||
|
target_description: str = ''
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Fonction utilitaire pour trouver une ancre et retourner les coordonnées de clic.
|
Fonction utilitaire pour trouver une ancre et retourner les coordonnées de clic.
|
||||||
@@ -665,11 +667,16 @@ def find_and_click(
|
|||||||
- 'clip': UI-DETR-1 + CLIP (matching sémantique intelligent, recommandé)
|
- 'clip': UI-DETR-1 + CLIP (matching sémantique intelligent, recommandé)
|
||||||
- 'zoned': Template matching zonée (fallback)
|
- 'zoned': Template matching zonée (fallback)
|
||||||
|
|
||||||
|
En dernier recours, si target_text est fourni, utilise la chaîne de grounding
|
||||||
|
(OCR → UI-TARS → VLM) via find_element_on_screen.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
anchor_image_base64: Image de l'ancre en base64
|
anchor_image_base64: Image de l'ancre en base64
|
||||||
anchor_bbox: Bounding box originale
|
anchor_bbox: Bounding box originale
|
||||||
method: 'clip' pour UI-DETR-1+CLIP, 'zoned' pour template zonée
|
method: 'clip' pour UI-DETR-1+CLIP, 'zoned' pour template zonée
|
||||||
detection_threshold: Seuil de détection pour UI-DETR-1
|
detection_threshold: Seuil de détection pour UI-DETR-1
|
||||||
|
target_text: Texte de l'élément à trouver (pour fallback grounding)
|
||||||
|
target_description: Description longue (pour fallback grounding)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict avec found, coordinates, confidence, etc.
|
Dict avec found, coordinates, confidence, etc.
|
||||||
@@ -815,6 +822,35 @@ def find_and_click(
|
|||||||
except Exception as seeclick_err:
|
except Exception as seeclick_err:
|
||||||
print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}")
|
print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}")
|
||||||
|
|
||||||
|
# === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
|
||||||
|
if target_text or target_description:
|
||||||
|
try:
|
||||||
|
from core.execution.input_handler import find_element_on_screen
|
||||||
|
print(f"🔗 [Vision] Dernier recours: chaîne de grounding pour '{target_text or target_description}'...")
|
||||||
|
grounding_result = find_element_on_screen(
|
||||||
|
target_text=target_text,
|
||||||
|
target_description=target_description,
|
||||||
|
anchor_image_base64=anchor_image_base64
|
||||||
|
)
|
||||||
|
if grounding_result:
|
||||||
|
gx, gy = grounding_result['x'], grounding_result['y']
|
||||||
|
gmethod = grounding_result['method']
|
||||||
|
gconf = grounding_result['confidence']
|
||||||
|
print(f"✅ [Vision] Grounding réussi via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
|
||||||
|
return {
|
||||||
|
'found': True,
|
||||||
|
'confidence': gconf,
|
||||||
|
'coordinates': {'x': gx, 'y': gy},
|
||||||
|
'bbox': anchor_bbox,
|
||||||
|
'method': f'grounding_{gmethod}',
|
||||||
|
'search_time_ms': (_time.time() - start_time) * 1000,
|
||||||
|
'candidates': []
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
print(f"❌ [Vision] Chaîne de grounding échouée pour '{target_text or target_description}'")
|
||||||
|
except Exception as grounding_err:
|
||||||
|
print(f"⚠️ [Vision] Erreur chaîne de grounding: {grounding_err}")
|
||||||
|
|
||||||
# === Toutes les méthodes visuelles ont échoué ===
|
# === Toutes les méthodes visuelles ont échoué ===
|
||||||
if anchor_bbox:
|
if anchor_bbox:
|
||||||
best_conf = max(global_result.get('confidence', 0), 0)
|
best_conf = max(global_result.get('confidence', 0), 0)
|
||||||
|
|||||||
Reference in New Issue
Block a user