From d1b556b6cd98730d53f22a2d4dc0b9ca350444cb Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 21 Apr 2026 10:05:29 +0200 Subject: [PATCH] =?UTF-8?q?fix(grounding):=20supprimer=20SeeClick=20cass?= =?UTF-8?q?=C3=A9=20+=20log=20description=20ancre?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SeeClick supprimé : modèle HF incompatible (QWenConfig non reconnu), crashait à chaque exécution et polluait les logs. Remplacé par UI-TARS via la chaîne de grounding. Log warning visible quand la description VLM de l'ancre échoue (pour diagnostiquer les problèmes de VRAM). Co-Authored-By: Claude Opus 4.6 (1M context) --- core/execution/input_handler.py | 2 +- .../backend/services/intelligent_executor.py | 46 ------------------- 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py index 4d212c30b..c262edbfe 100644 --- a/core/execution/input_handler.py +++ b/core/execution/input_handler.py @@ -424,7 +424,7 @@ def _describe_anchor_image(anchor_image_base64: str) -> Optional[str]: return None except Exception as e: - logger.debug(f"Description ancre échouée: {e}") + logger.warning(f"[Grounding] Description ancre échouée: {e}") return None diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py index 30cf5b275..fc0d73841 100644 --- a/visual_workflow_builder/backend/services/intelligent_executor.py +++ b/visual_workflow_builder/backend/services/intelligent_executor.py @@ -776,52 +776,6 @@ def find_and_click( global_result['search_time_ms'] = (_time.time() - start_time) * 1000 return global_result - # === STRATÉGIE 4: SeeClick (visual grounding) === - # Essayer SeeClick si les autres méthodes ont échoué - try: - print("🎯 [Vision] Essai SeeClick (visual grounding)...") - from core.detection.seeclick_adapter import get_seeclick - - seeclick = get_seeclick() - if seeclick.available: - # Utiliser une description générique basée sur l'ancre - # TODO: Améliorer avec une description plus précise - description = "the clickable element or button" - - grounding_result = seeclick.ground(screen_image, description, return_pixels=True) - - if grounding_result.found: - found_x = grounding_result.x_pixel - found_y = grounding_result.y_pixel - - # Vérifier la distance à la position originale si anchor_bbox existe - accept_seeclick = True - if anchor_bbox: - orig_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) // 2 - orig_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) // 2 - distance = np.sqrt((found_x - orig_x)**2 + (found_y - orig_y)**2) - - MAX_SEECLICK_DISTANCE = 500 - if distance > MAX_SEECLICK_DISTANCE: - print(f"⛔ [Vision] SeeClick rejeté: distance {distance:.0f}px > {MAX_SEECLICK_DISTANCE}px max") - accept_seeclick = False - - if accept_seeclick: - print(f"✅ [Vision] SeeClick réussi! Coordonnées: ({found_x}, {found_y})") - return { - 'found': True, - 'confidence': grounding_result.confidence, - 'coordinates': {'x': found_x, 'y': found_y}, - 'bbox': anchor_bbox, - 'method': 'seeclick_grounding', - 'search_time_ms': (_time.time() - start_time) * 1000, - 'raw_output': grounding_result.raw_output - } - except ImportError: - print("ℹ️ [Vision] SeeClick non disponible (module non trouvé)") - except Exception as seeclick_err: - print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}") - # === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) === if target_text or target_description: try: