feat: chaîne de grounding 3 niveaux + refonte capture écran
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 12s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Grounding en cascade quand CLIP/template échouent :
1. OCR (docTR) → cherche le texte exact sur l'écran (~1s)
2. UI-TARS grounding → "click on X" → coordonnées (~3s, 94% ScreenSpot)
3. VLM reasoning → raisonnement complet + confirmation OCR (~10s)

find_element_on_screen() dans input_handler.py (partagé VWB + Léa).
Câblé dans find_and_click() et execute_action() comme fallback.

Refonte capture écran :
- mss.monitors[0] (composite) pour capturer la VM en plein écran
- FullscreenSelector réécrit : overlay via getBoundingClientRect()
- Bboxes et sélection alignées avec l'image (calcul JS, pas CSS)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -29,6 +29,7 @@ from core.execution.input_handler import (
|
||||
check_screen_for_patterns as _shared_check_patterns,
|
||||
handle_detected_pattern as _shared_handle_pattern,
|
||||
post_execution_cleanup as _shared_post_cleanup,
|
||||
find_element_on_screen as _shared_find_element,
|
||||
)
|
||||
|
||||
|
||||
@@ -213,6 +214,9 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
|
||||
break
|
||||
|
||||
# === EXÉCUTION DE L'ACTION ===
|
||||
# Passer le label de l'étape pour le grounding textuel
|
||||
if step.label:
|
||||
params['_step_label'] = step.label
|
||||
result = execute_action(step.action_type, params)
|
||||
|
||||
# === SELF-HEALING INTERACTIF ===
|
||||
@@ -809,12 +813,20 @@ def execute_action(action_type: str, params: dict) -> dict:
|
||||
'height': bbox.get('height', 0)
|
||||
}
|
||||
|
||||
# Extraire le texte cible pour le grounding en dernier recours
|
||||
_fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
|
||||
if not _fc_target_text:
|
||||
_fc_target_text = params.get('_step_label', '')
|
||||
_fc_target_desc = params.get('visual_anchor', {}).get('description', '')
|
||||
|
||||
# Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
|
||||
result = find_and_click(
|
||||
anchor_image_base64=screenshot_base64,
|
||||
anchor_bbox=anchor_bbox,
|
||||
method='clip', # UI-DETR-1 + CLIP avec pondération par distance
|
||||
detection_threshold=0.35
|
||||
detection_threshold=0.35,
|
||||
target_text=_fc_target_text,
|
||||
target_description=_fc_target_desc
|
||||
)
|
||||
|
||||
if result['found'] and result['coordinates']:
|
||||
@@ -853,6 +865,47 @@ def execute_action(action_type: str, params: dict) -> dict:
|
||||
print(f"❌ [Vision] Ancre NON trouvée (confiance: {confidence:.2f})")
|
||||
print(f" Raison: {reason}")
|
||||
|
||||
# === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
|
||||
target_text = params.get('visual_anchor', {}).get('target_text', '')
|
||||
if not target_text:
|
||||
target_text = params.get('_step_label', '')
|
||||
target_desc = params.get('visual_anchor', {}).get('description', '')
|
||||
|
||||
if target_text:
|
||||
print(f"🔗 [Grounding] Tentative cascade pour '{target_text}'...")
|
||||
grounding_result = _shared_find_element(
|
||||
target_text=target_text,
|
||||
target_description=target_desc,
|
||||
anchor_image_base64=screenshot_base64
|
||||
)
|
||||
if grounding_result:
|
||||
gx, gy = grounding_result['x'], grounding_result['y']
|
||||
gmethod = grounding_result['method']
|
||||
gconf = grounding_result['confidence']
|
||||
print(f"✅ [Grounding] Trouvé via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
|
||||
|
||||
# Effectuer le clic
|
||||
if click_type == 'double':
|
||||
pyautogui.doubleClick(gx, gy)
|
||||
elif click_type == 'right':
|
||||
pyautogui.rightClick(gx, gy)
|
||||
else:
|
||||
pyautogui.click(gx, gy)
|
||||
|
||||
time.sleep(2.0)
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'output': {
|
||||
'clicked_at': {'x': gx, 'y': gy},
|
||||
'mode': execution_mode,
|
||||
'confidence': gconf,
|
||||
'method': f'grounding_{gmethod}'
|
||||
}
|
||||
}
|
||||
else:
|
||||
print(f"❌ [Grounding] Cascade échouée pour '{target_text}'")
|
||||
|
||||
# Si self-healing interactif activé, proposer des alternatives
|
||||
if _execution_state.get('execution_mode') == 'intelligent' and candidates:
|
||||
print(f"🔄 [Self-Healing] {len(candidates)} candidats disponibles - attente choix utilisateur")
|
||||
|
||||
@@ -656,7 +656,9 @@ def find_and_click(
|
||||
anchor_image_base64: str,
|
||||
anchor_bbox: Optional[Dict[str, int]] = None,
|
||||
method: str = 'clip',
|
||||
detection_threshold: float = 0.35
|
||||
detection_threshold: float = 0.35,
|
||||
target_text: str = '',
|
||||
target_description: str = ''
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Fonction utilitaire pour trouver une ancre et retourner les coordonnées de clic.
|
||||
@@ -665,11 +667,16 @@ def find_and_click(
|
||||
- 'clip': UI-DETR-1 + CLIP (matching sémantique intelligent, recommandé)
|
||||
- 'zoned': Template matching zonée (fallback)
|
||||
|
||||
En dernier recours, si target_text est fourni, utilise la chaîne de grounding
|
||||
(OCR → UI-TARS → VLM) via find_element_on_screen.
|
||||
|
||||
Args:
|
||||
anchor_image_base64: Image de l'ancre en base64
|
||||
anchor_bbox: Bounding box originale
|
||||
method: 'clip' pour UI-DETR-1+CLIP, 'zoned' pour template zonée
|
||||
detection_threshold: Seuil de détection pour UI-DETR-1
|
||||
target_text: Texte de l'élément à trouver (pour fallback grounding)
|
||||
target_description: Description longue (pour fallback grounding)
|
||||
|
||||
Returns:
|
||||
Dict avec found, coordinates, confidence, etc.
|
||||
@@ -815,6 +822,35 @@ def find_and_click(
|
||||
except Exception as seeclick_err:
|
||||
print(f"⚠️ [Vision] Erreur SeeClick: {seeclick_err}")
|
||||
|
||||
# === FALLBACK: Chaîne de grounding (OCR → UI-TARS → VLM) ===
|
||||
if target_text or target_description:
|
||||
try:
|
||||
from core.execution.input_handler import find_element_on_screen
|
||||
print(f"🔗 [Vision] Dernier recours: chaîne de grounding pour '{target_text or target_description}'...")
|
||||
grounding_result = find_element_on_screen(
|
||||
target_text=target_text,
|
||||
target_description=target_description,
|
||||
anchor_image_base64=anchor_image_base64
|
||||
)
|
||||
if grounding_result:
|
||||
gx, gy = grounding_result['x'], grounding_result['y']
|
||||
gmethod = grounding_result['method']
|
||||
gconf = grounding_result['confidence']
|
||||
print(f"✅ [Vision] Grounding réussi via {gmethod} à ({gx}, {gy}) conf={gconf:.2f}")
|
||||
return {
|
||||
'found': True,
|
||||
'confidence': gconf,
|
||||
'coordinates': {'x': gx, 'y': gy},
|
||||
'bbox': anchor_bbox,
|
||||
'method': f'grounding_{gmethod}',
|
||||
'search_time_ms': (_time.time() - start_time) * 1000,
|
||||
'candidates': []
|
||||
}
|
||||
else:
|
||||
print(f"❌ [Vision] Chaîne de grounding échouée pour '{target_text or target_description}'")
|
||||
except Exception as grounding_err:
|
||||
print(f"⚠️ [Vision] Erreur chaîne de grounding: {grounding_err}")
|
||||
|
||||
# === Toutes les méthodes visuelles ont échoué ===
|
||||
if anchor_bbox:
|
||||
best_conf = max(global_result.get('confidence', 0), 0)
|
||||
|
||||
Reference in New Issue
Block a user