feat(execution): cascade post-raccourci pilotée par DialogHandler/OCR
Le pHash global 8x8 sur écran 1920x1080 ne détecte pas l'ouverture d'un dialog modal dans une VM QEMU (un dialog 800x500 couvre ~3 pixels pHash, distance Hamming typique = 1-2, sous le seuil de 3). Découvert sur Win11/Notepad : Ctrl+Shift+S ouvrait bien le dialog mais Léa abortait à tort. _handle_post_shortcut() polle désormais DialogHandler.handle_if_dialog() toutes les 500ms (EasyOCR + KNOWN_DIALOGS). 8s pour le premier dialog, 3s de stabilité entre dialogs successifs, 60s budget total. KNOWN_DIALOGS réordonné : popups modaux (confirmer/remplacer/écraser) prioritaires sur fenêtres parents (enregistrer sous/save as) car l'OCR full-screen capte les deux simultanément. DialogHandler bascule sur UITarsGrounder subprocess one-shot (au lieu du serveur HTTP localhost:8200 qui n'existait plus). InfiGUI worker, think_arbiter et ui_tars_grounder alignés sur le même contrat. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -221,22 +221,31 @@ class ORALoop:
|
|||||||
if action_type in ('click_anchor', 'click', 'double_click_anchor', 'right_click_anchor'):
|
if action_type in ('click_anchor', 'click', 'double_click_anchor', 'right_click_anchor'):
|
||||||
target_text = anchor.get('target_text', '') or anchor.get('description', '')
|
target_text = anchor.get('target_text', '') or anchor.get('description', '')
|
||||||
|
|
||||||
# Si target_text est vide ou est un nom d'action → décrire le crop
|
# Détecter les target_text absurdes : vide, nom d'action, ou bruit OCR
|
||||||
if not target_text or target_text in _action_type_names:
|
def _is_garbage(t):
|
||||||
screenshot_b64 = anchor.get('screenshot', '')
|
if not t or t in _action_type_names:
|
||||||
if screenshot_b64:
|
return True
|
||||||
try:
|
# Bruit OCR : que des caractères spéciaux/chiffres/espaces
|
||||||
from core.execution.input_handler import _describe_anchor_image
|
cleaned = t.replace('-', '').replace(' ', '').replace('.', '').replace('_', '')
|
||||||
desc = _describe_anchor_image(screenshot_b64)
|
if len(cleaned) < 3:
|
||||||
if desc and len(desc) > 2:
|
return True
|
||||||
target_text = desc
|
# Que des chiffres
|
||||||
print(f"🏷️ [ORA/reason] Ancre décrite par VLM: '{target_text}'")
|
if cleaned.isdigit():
|
||||||
except Exception:
|
return True
|
||||||
pass
|
return False
|
||||||
|
|
||||||
|
# Note: plus d'appel à _describe_anchor_image() (qwen2.5vl) ici.
|
||||||
|
# Le crop d'ancre (screenshot_b64) servira directement au template matching
|
||||||
|
# cv2 dans _act_click, puis fallback InfiGUI fusionné si nécessaire.
|
||||||
|
# Cela évite le conflit VRAM (qwen2.5vl 9.4GB + InfiGUI 2.4GB > 11.5GB GPU).
|
||||||
|
|
||||||
# Dernier fallback : label si pas un nom d'action
|
# Dernier fallback : label si pas un nom d'action
|
||||||
if not target_text or target_text in _action_type_names:
|
if _is_garbage(target_text):
|
||||||
target_text = label if label not in _action_type_names else ''
|
target_text = label if label not in _action_type_names else ''
|
||||||
|
if target_text:
|
||||||
|
print(f"🏷️ [ORA/reason] Label garbage, fallback texte: '{target_text}'")
|
||||||
|
else:
|
||||||
|
print(f"🏷️ [ORA/reason] Pas de label texte — grounding via crop visuel uniquement")
|
||||||
|
|
||||||
action = 'click'
|
action = 'click'
|
||||||
value = 'double' if action_type == 'double_click_anchor' else (
|
value = 'double' if action_type == 'double_click_anchor' else (
|
||||||
@@ -1245,6 +1254,7 @@ Règles:
|
|||||||
)
|
)
|
||||||
|
|
||||||
print(f"🚀 [ORA] Démarrage workflow: {total} étapes, verify={self.verify_level}, retries={self.max_retries}")
|
print(f"🚀 [ORA] Démarrage workflow: {total} étapes, verify={self.verify_level}, retries={self.max_retries}")
|
||||||
|
print(f"🔧 [ORA] CODE VERSION: post-shortcut-dialog-handler ACTIF (26 avril 17h30)")
|
||||||
|
|
||||||
for i, step in enumerate(steps):
|
for i, step in enumerate(steps):
|
||||||
if not self._should_continue():
|
if not self._should_continue():
|
||||||
@@ -1326,6 +1336,47 @@ Règles:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- 3b. Post-raccourci : attendre changement écran + gérer dialogue ---
|
||||||
|
# Après un keyboard_shortcut (pas scroll), on polle le pHash pour détecter
|
||||||
|
# si un dialogue est apparu (ex: "Enregistrer sous" après Ctrl+Shift+S).
|
||||||
|
# Si oui → InfiGUI localise et clique le bouton visuellement.
|
||||||
|
if act_success and decision.action == 'hotkey' and not decision.value.startswith('scroll_'):
|
||||||
|
print(f"🔍 [ORA/post-shortcut] ENTRÉ dans le bloc post-shortcut (action={decision.action}, value={decision.value})")
|
||||||
|
dialog_handled = self._handle_post_shortcut(pre)
|
||||||
|
if dialog_handled:
|
||||||
|
time.sleep(0.5)
|
||||||
|
post = self.observe()
|
||||||
|
self._last_post_phash = post.phash
|
||||||
|
if on_progress:
|
||||||
|
on_progress(i + 1, total, VerificationResult(
|
||||||
|
success=True, change_level='major',
|
||||||
|
matches_expected=True,
|
||||||
|
detail="Dialogue géré visuellement après raccourci"
|
||||||
|
))
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# Invariant : aucune étape suivante ne doit s'exécuter tant que
|
||||||
|
# la cascade déclenchée par le raccourci n'est pas pleinement résolue.
|
||||||
|
# Cas typique : Ctrl+S → "Enregistrer sous" non géré → on ABORT plutôt
|
||||||
|
# que de cliquer sur des coordonnées potentiellement obsolètes.
|
||||||
|
msg = (
|
||||||
|
f"Étape {i+1}: raccourci '{decision.value}' — cascade post-raccourci "
|
||||||
|
f"non résolue (dialogue absent ou bloqué). Workflow stoppé pour éviter "
|
||||||
|
f"un clic dans un contexte incohérent."
|
||||||
|
)
|
||||||
|
print(f"❌ [ORA/post-shortcut] {msg}")
|
||||||
|
logger.warning(f"🆘 [ORA] {msg}")
|
||||||
|
if on_progress:
|
||||||
|
on_progress(i + 1, total, VerificationResult(
|
||||||
|
success=False, change_level='none',
|
||||||
|
matches_expected=False,
|
||||||
|
detail="Cascade post-raccourci non résolue"
|
||||||
|
))
|
||||||
|
return LoopResult(
|
||||||
|
success=False, steps_completed=i, total_steps=total,
|
||||||
|
reason=msg,
|
||||||
|
)
|
||||||
|
|
||||||
# Petit délai pour laisser l'écran se stabiliser
|
# Petit délai pour laisser l'écran se stabiliser
|
||||||
time.sleep(0.3)
|
time.sleep(0.3)
|
||||||
|
|
||||||
@@ -1412,6 +1463,107 @@ Règles:
|
|||||||
# Méthodes privées — actions
|
# Méthodes privées — actions
|
||||||
# ═══════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
def _handle_post_shortcut(self, pre_obs: 'Observation') -> bool:
|
||||||
|
"""Après un raccourci clavier, résoudre la cascade de dialogues réflexes.
|
||||||
|
|
||||||
|
Pilotage par DialogHandler (OCR direct), PAS par pHash. Raison :
|
||||||
|
un dialog modal qui s'ouvre dans une VM ne change quasiment pas le
|
||||||
|
pHash global de l'écran hôte (signature 8x8 sur 1920x1080 — un dialog
|
||||||
|
de 800x500 couvre ~3 pixels pHash, distance Hamming souvent < 3).
|
||||||
|
On poll donc directement DialogHandler.handle_if_dialog().
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True si au moins un dialog connu a été détecté + géré et qu'aucun
|
||||||
|
autre dialog n'apparaît dans la fenêtre de stabilité finale.
|
||||||
|
False si aucun dialog connu n'apparaît dans la fenêtre d'attente
|
||||||
|
initiale (le workflow doit ABORT — état incohérent).
|
||||||
|
"""
|
||||||
|
from core.grounding.dialog_handler import DialogHandler
|
||||||
|
|
||||||
|
# Fenêtre d'attente du PREMIER dialog après le raccourci. Win11/QEMU :
|
||||||
|
# Ctrl+Shift+S → "Enregistrer sous" apparaît en <2s typiquement.
|
||||||
|
first_dialog_timeout = 8.0
|
||||||
|
# Budget total pour résoudre toute la cascade (InfiGUI ~15s/dialog).
|
||||||
|
total_timeout = 60.0
|
||||||
|
# Fenêtre de stabilité après le dernier dialog géré : si rien d'autre
|
||||||
|
# n'apparaît pendant cette durée, la cascade est considérée terminée.
|
||||||
|
# Doit couvrir l'apparition du popup modal suivant (post_click_wait + marge).
|
||||||
|
stable_window = 3.0
|
||||||
|
# Délai post-clic avant de tester le dialog suivant.
|
||||||
|
post_click_wait = 1.5
|
||||||
|
# Cadence de polling OCR (EasyOCR full-screen ~500ms/poll).
|
||||||
|
poll_interval = 0.5
|
||||||
|
# Garde-fou anti-boucle infinie.
|
||||||
|
max_dialog_iterations = 5
|
||||||
|
|
||||||
|
t_start = time.time()
|
||||||
|
dh = DialogHandler()
|
||||||
|
dialogs_handled = 0
|
||||||
|
|
||||||
|
def _elapsed() -> float:
|
||||||
|
return time.time() - t_start
|
||||||
|
|
||||||
|
def _poll_dialog(deadline: float) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Poll DialogHandler jusqu'à détection d'un dialog connu OU deadline.
|
||||||
|
|
||||||
|
Retourne le dict result si un dialog connu a été géré (cliqué),
|
||||||
|
None si la deadline est atteinte sans match. Si DialogHandler
|
||||||
|
détecte ET clique avec succès, le clic InfiGUI peut excéder la
|
||||||
|
deadline mais on retourne quand même le résultat (action déjà
|
||||||
|
engagée — on ne va pas l'annuler).
|
||||||
|
"""
|
||||||
|
while time.time() < deadline:
|
||||||
|
obs = self.observe()
|
||||||
|
try:
|
||||||
|
result = dh.handle_if_dialog(obs.screenshot)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ [ORA/post-shortcut] Erreur dialog handler: {e}")
|
||||||
|
return None
|
||||||
|
if result.get('handled'):
|
||||||
|
return result
|
||||||
|
sleep_left = deadline - time.time()
|
||||||
|
if sleep_left > 0:
|
||||||
|
time.sleep(min(poll_interval, sleep_left))
|
||||||
|
return None
|
||||||
|
|
||||||
|
# --- Étape 1 : attendre le PREMIER dialog ---
|
||||||
|
first_deadline = t_start + min(total_timeout, first_dialog_timeout)
|
||||||
|
result = _poll_dialog(first_deadline)
|
||||||
|
if result is None:
|
||||||
|
print(f"⏳ [ORA/post-shortcut] Aucun dialog connu détecté après "
|
||||||
|
f"{_elapsed():.1f}s (fenêtre={first_dialog_timeout}s) — "
|
||||||
|
f"raccourci sans effet attendu")
|
||||||
|
return False
|
||||||
|
|
||||||
|
dialogs_handled = 1
|
||||||
|
print(f"✅ [ORA/post-shortcut] Dialog #1 géré: {result.get('action')} "
|
||||||
|
f"({_elapsed():.1f}s)")
|
||||||
|
time.sleep(post_click_wait)
|
||||||
|
|
||||||
|
# --- Étape 2 : cascade — chaque dialog suivant doit apparaître dans stable_window ---
|
||||||
|
for iteration in range(1, max_dialog_iterations):
|
||||||
|
if _elapsed() >= total_timeout:
|
||||||
|
print(f"⏳ [ORA/post-shortcut] Timeout cascade ({total_timeout:.0f}s, "
|
||||||
|
f"{dialogs_handled} dialog(s) géré(s))")
|
||||||
|
return True # au moins un dialog traité → considéré OK
|
||||||
|
|
||||||
|
next_deadline = min(time.time() + stable_window, t_start + total_timeout)
|
||||||
|
result = _poll_dialog(next_deadline)
|
||||||
|
if result is None:
|
||||||
|
# Pas de nouveau dialog dans stable_window → cascade terminée
|
||||||
|
print(f"✅ [ORA/post-shortcut] Cascade résolue "
|
||||||
|
f"({dialogs_handled} dialog(s), {_elapsed():.1f}s)")
|
||||||
|
return True
|
||||||
|
|
||||||
|
dialogs_handled += 1
|
||||||
|
print(f"✅ [ORA/post-shortcut] Dialog #{dialogs_handled} géré: "
|
||||||
|
f"{result.get('action')} ({_elapsed():.1f}s)")
|
||||||
|
time.sleep(post_click_wait)
|
||||||
|
|
||||||
|
print(f"⚠️ [ORA/post-shortcut] Trop d'itérations cascade "
|
||||||
|
f"({max_dialog_iterations}) — cascade malformée, on s'arrête là")
|
||||||
|
return dialogs_handled > 0
|
||||||
|
|
||||||
def _act_click(self, decision: Decision, step_params: dict) -> bool:
|
def _act_click(self, decision: Decision, step_params: dict) -> bool:
|
||||||
"""Exécute un clic (simple, double, droit, hover, focus).
|
"""Exécute un clic (simple, double, droit, hover, focus).
|
||||||
|
|
||||||
@@ -1425,16 +1577,62 @@ Règles:
|
|||||||
anchor = step_params.get('visual_anchor', {})
|
anchor = step_params.get('visual_anchor', {})
|
||||||
screenshot_b64 = anchor.get('screenshot')
|
screenshot_b64 = anchor.get('screenshot')
|
||||||
bbox = anchor.get('bounding_box', {})
|
bbox = anchor.get('bounding_box', {})
|
||||||
target_text = anchor.get('target_text', '') or decision.target
|
# Utiliser le target nettoyé par reason_workflow_step (pas relire le garbage de l'ancre)
|
||||||
|
target_text = decision.target
|
||||||
target_desc = anchor.get('description', '')
|
target_desc = anchor.get('description', '')
|
||||||
|
|
||||||
|
print(f"🎯 [ORA/_act_click] target='{target_text}', desc='{target_desc[:40]}', bbox={bbox.get('x','?')},{bbox.get('y','?')}")
|
||||||
|
|
||||||
x, y = None, None
|
x, y = None, None
|
||||||
method_used = ''
|
method_used = ''
|
||||||
|
# Score et position du template-first (réutilisés en fallback intermédiaire)
|
||||||
|
template_score = 0.0
|
||||||
|
template_xy: Optional[tuple] = None
|
||||||
|
|
||||||
# --- Pipeline FAST→SMART→THINK ---
|
# --- AVANT-POSTE : template matching cv2 sur le crop d'ancre ---
|
||||||
|
# Si l'UI n'a pas changé (cas dominant en replay), un match pixel-perfect
|
||||||
|
# nous donne le clic en ~50ms sans toucher au GPU. On ne déclenche le
|
||||||
|
# pipeline VLM que si le score est insuffisant.
|
||||||
|
if screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
||||||
|
try:
|
||||||
|
import io as _io
|
||||||
|
with mss_lib.mss() as sct:
|
||||||
|
mon = sct.monitors[0]
|
||||||
|
grab = sct.grab(mon)
|
||||||
|
screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
|
||||||
|
|
||||||
|
raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
|
||||||
|
anchor_data = base64.b64decode(raw_b64)
|
||||||
|
anchor_img = Image.open(_io.BytesIO(anchor_data))
|
||||||
|
|
||||||
|
screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
|
||||||
|
anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
|
||||||
|
|
||||||
|
if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
|
||||||
|
t0 = time.time()
|
||||||
|
result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
|
||||||
|
_, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
|
||||||
|
elapsed_ms = (time.time() - t0) * 1000
|
||||||
|
template_score = float(max_val)
|
||||||
|
template_xy = (
|
||||||
|
max_loc[0] + anchor_cv.shape[1] // 2,
|
||||||
|
max_loc[1] + anchor_cv.shape[0] // 2,
|
||||||
|
)
|
||||||
|
print(f"⚡ [ORA/template-first] score={template_score:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
|
||||||
|
# Seuil élevé pour le mode "direct" : on veut être quasi-certain
|
||||||
|
# que c'est le même élément, pixel-perfect, avant de zapper le VLM.
|
||||||
|
if template_score >= 0.95:
|
||||||
|
x, y = template_xy
|
||||||
|
method_used = 'template_direct'
|
||||||
|
print(f"✅ [ORA/template-first] Match direct → ({x}, {y}), skip pipeline")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ [ORA/template-first] Erreur: {e}")
|
||||||
|
|
||||||
|
# --- Pipeline FAST→SMART→THINK (escalade si template-first n'a pas tranché) ---
|
||||||
_use_fast = os.environ.get('RPA_USE_FAST_PIPELINE', '1') == '1'
|
_use_fast = os.environ.get('RPA_USE_FAST_PIPELINE', '1') == '1'
|
||||||
|
|
||||||
if _use_fast and (target_text or target_desc):
|
if x is None and _use_fast and (target_text or target_desc or screenshot_b64):
|
||||||
|
print(f"🎯 [ORA/_act_click] RPA_USE_FAST_PIPELINE={_use_fast}, has_target={bool(target_text or target_desc)}, template_score={template_score:.3f}")
|
||||||
try:
|
try:
|
||||||
from core.grounding.fast_pipeline import FastSmartThinkPipeline
|
from core.grounding.fast_pipeline import FastSmartThinkPipeline
|
||||||
from core.grounding.target import GroundingTarget
|
from core.grounding.target import GroundingTarget
|
||||||
@@ -1471,34 +1669,13 @@ Règles:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ [ORA/pipeline] Erreur: {e}")
|
print(f"⚠️ [ORA/pipeline] Erreur: {e}")
|
||||||
|
|
||||||
# --- Fallback : ancien pipeline (template → OCR → static) ---
|
# --- Fallback : on réutilise le score template-first si pertinent ---
|
||||||
if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
# Si le pipeline VLM a échoué mais que le template-first avait un score
|
||||||
try:
|
# intermédiaire (0.75-0.95), on accepte ce match comme secours.
|
||||||
import io as _io
|
if x is None and template_xy is not None and template_score >= 0.75:
|
||||||
with mss_lib.mss() as sct:
|
x, y = template_xy
|
||||||
mon = sct.monitors[0]
|
method_used = 'template_fallback'
|
||||||
grab = sct.grab(mon)
|
print(f"⚡ [ORA/template-fallback] Réutilisation score={template_score:.3f} → ({x}, {y})")
|
||||||
screen_img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
|
|
||||||
|
|
||||||
raw_b64 = screenshot_b64.split(',')[1] if ',' in screenshot_b64 else screenshot_b64
|
|
||||||
anchor_data = base64.b64decode(raw_b64)
|
|
||||||
anchor_img = Image.open(_io.BytesIO(anchor_data))
|
|
||||||
|
|
||||||
screen_cv = cv2.cvtColor(np.array(screen_img), cv2.COLOR_RGB2BGR)
|
|
||||||
anchor_cv = cv2.cvtColor(np.array(anchor_img), cv2.COLOR_RGB2BGR)
|
|
||||||
|
|
||||||
if anchor_cv.shape[0] < screen_cv.shape[0] and anchor_cv.shape[1] < screen_cv.shape[1]:
|
|
||||||
t0 = time.time()
|
|
||||||
result_tm = cv2.matchTemplate(screen_cv, anchor_cv, cv2.TM_CCOEFF_NORMED)
|
|
||||||
_, max_val, _, max_loc = cv2.minMaxLoc(result_tm)
|
|
||||||
elapsed_ms = (time.time() - t0) * 1000
|
|
||||||
print(f"⚡ [ORA/template] score={max_val:.3f} pos={max_loc} ({elapsed_ms:.0f}ms)")
|
|
||||||
if max_val > 0.75:
|
|
||||||
x = max_loc[0] + anchor_cv.shape[1] // 2
|
|
||||||
y = max_loc[1] + anchor_cv.shape[0] // 2
|
|
||||||
method_used = 'template'
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠️ [ORA/template] Erreur: {e}")
|
|
||||||
|
|
||||||
if x is None and target_text:
|
if x is None and target_text:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -25,31 +25,42 @@ import time
|
|||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
# Titres connus → quelle action demander à InfiGUI
|
# Titres connus → quelle action demander à InfiGUI.
|
||||||
|
#
|
||||||
|
# IMPORTANT — ordre du dict = priorité de matching.
|
||||||
|
# L'OCR est full-screen et capte souvent le texte du dialog parent ET du popup
|
||||||
|
# modal qui apparaît par-dessus (ex: "Enregistrer sous" reste visible derrière
|
||||||
|
# "Confirmer l'enregistrement"). Les popups modaux DOIVENT matcher avant les
|
||||||
|
# fenêtres principales, sinon Léa clique sur le bouton du parent qui n'a pas
|
||||||
|
# le focus.
|
||||||
KNOWN_DIALOGS = {
|
KNOWN_DIALOGS = {
|
||||||
"enregistrer sous": {"target": "Enregistrer", "description": "Clique sur le bouton Enregistrer dans le dialogue Enregistrer sous"},
|
# ── Popups modaux de confirmation (priorité HAUTE) ──────────────────
|
||||||
"save as": {"target": "Save", "description": "Click the Save button in the Save As dialog"},
|
"voulez-vous le remplacer": {"target": "Oui", "description": "Clique sur Oui pour confirmer le remplacement du fichier"},
|
||||||
"confirmer": {"target": "Oui", "description": "Clique sur le bouton Oui dans le dialogue de confirmation"},
|
"do you want to replace": {"target": "Yes", "description": "Click Yes to confirm file replacement"},
|
||||||
|
"existe déjà": {"target": "Oui", "description": "Clique sur Oui, le fichier existe déjà et doit être remplacé"},
|
||||||
|
"already exists": {"target": "Yes", "description": "Click Yes, the file already exists"},
|
||||||
"remplacer": {"target": "Oui", "description": "Clique sur le bouton Oui pour confirmer le remplacement du fichier"},
|
"remplacer": {"target": "Oui", "description": "Clique sur le bouton Oui pour confirmer le remplacement du fichier"},
|
||||||
"replace": {"target": "Yes", "description": "Click Yes to confirm file replacement"},
|
"replace": {"target": "Yes", "description": "Click Yes to confirm file replacement"},
|
||||||
"voulez-vous enregistrer": {"target": "Enregistrer", "description": "Clique sur Enregistrer pour sauvegarder les modifications"},
|
|
||||||
"do you want to save": {"target": "Save", "description": "Click Save to save changes"},
|
|
||||||
"overwrite": {"target": "Yes", "description": "Click Yes to overwrite"},
|
|
||||||
"écraser": {"target": "Oui", "description": "Clique sur Oui pour écraser le fichier"},
|
"écraser": {"target": "Oui", "description": "Clique sur Oui pour écraser le fichier"},
|
||||||
"already exists": {"target": "Yes", "description": "Click Yes, the file already exists"},
|
"overwrite": {"target": "Yes", "description": "Click Yes to overwrite"},
|
||||||
"existe déjà": {"target": "Oui", "description": "Clique sur Oui, le fichier existe déjà"},
|
"confirmer l'enregistrement": {"target": "Oui", "description": "Clique sur Oui dans le popup de confirmation d'enregistrement"},
|
||||||
|
"confirmer": {"target": "Oui", "description": "Clique sur le bouton Oui dans le dialogue de confirmation"},
|
||||||
|
# ── Avertissements/erreurs (priorité haute, 1 seul bouton OK) ───────
|
||||||
"erreur": {"target": "OK", "description": "Clique sur OK pour fermer le message d'erreur"},
|
"erreur": {"target": "OK", "description": "Clique sur OK pour fermer le message d'erreur"},
|
||||||
"error": {"target": "OK", "description": "Click OK to close the error message"},
|
"error": {"target": "OK", "description": "Click OK to close the error message"},
|
||||||
"avertissement": {"target": "OK", "description": "Clique sur OK pour fermer l'avertissement"},
|
"avertissement": {"target": "OK", "description": "Clique sur OK pour fermer l'avertissement"},
|
||||||
"warning": {"target": "OK", "description": "Click OK to close the warning"},
|
"warning": {"target": "OK", "description": "Click OK to close the warning"},
|
||||||
|
# ── Dialogs principaux de sauvegarde (priorité BASSE — fenêtres parents) ─
|
||||||
|
"voulez-vous enregistrer": {"target": "Enregistrer", "description": "Clique sur Enregistrer pour sauvegarder les modifications"},
|
||||||
|
"do you want to save": {"target": "Save", "description": "Click Save to save changes"},
|
||||||
|
"enregistrer sous": {"target": "Enregistrer", "description": "Clique sur le bouton Enregistrer dans le dialogue Enregistrer sous"},
|
||||||
|
"save as": {"target": "Save", "description": "Click the Save button in the Save As dialog"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class DialogHandler:
|
class DialogHandler:
|
||||||
"""Gestion intelligente des dialogues via titre + InfiGUI."""
|
"""Gestion intelligente des dialogues via titre + InfiGUI."""
|
||||||
|
|
||||||
GROUNDING_URL = "http://localhost:8200"
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._easyocr_reader = None
|
self._easyocr_reader = None
|
||||||
|
|
||||||
@@ -169,29 +180,21 @@ class DialogHandler:
|
|||||||
def _click_via_infigui(
|
def _click_via_infigui(
|
||||||
self, target: str, description: str, screenshot_pil
|
self, target: str, description: str, screenshot_pil
|
||||||
) -> Optional[Dict]:
|
) -> Optional[Dict]:
|
||||||
"""Demande à InfiGUI de localiser et cliquer sur le bouton."""
|
"""Demande à InfiGUI (subprocess one-shot) de localiser et cliquer sur le bouton."""
|
||||||
try:
|
try:
|
||||||
import requests
|
from core.grounding.ui_tars_grounder import UITarsGrounder
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
|
|
||||||
buf = io.BytesIO()
|
grounder = UITarsGrounder.get_instance()
|
||||||
screenshot_pil.save(buf, format='JPEG', quality=85)
|
result = grounder.ground(
|
||||||
b64 = base64.b64encode(buf.getvalue()).decode()
|
target_text=target,
|
||||||
|
target_description=description,
|
||||||
|
screen_pil=screenshot_pil,
|
||||||
|
)
|
||||||
|
|
||||||
resp = requests.post(f"{self.GROUNDING_URL}/ground", json={
|
if result and result.x is not None:
|
||||||
'target_text': target,
|
|
||||||
'target_description': description,
|
|
||||||
'image_b64': b64,
|
|
||||||
}, timeout=15)
|
|
||||||
|
|
||||||
if resp.status_code == 200:
|
|
||||||
data = resp.json()
|
|
||||||
if data.get('x') is not None:
|
|
||||||
# Cliquer
|
|
||||||
import pyautogui
|
import pyautogui
|
||||||
pyautogui.click(data['x'], data['y'])
|
pyautogui.click(result.x, result.y)
|
||||||
return data
|
return {'x': result.x, 'y': result.y}
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,14 @@ def load_model():
|
|||||||
|
|
||||||
|
|
||||||
def infer(model, processor, req):
|
def infer(model, processor, req):
|
||||||
"""Fait une inférence."""
|
"""Fait une inférence.
|
||||||
|
|
||||||
|
Modes :
|
||||||
|
- texte seul (target/description) : grounding classique
|
||||||
|
- fusionné (anchor_image_path présent) : on passe en plus le crop d'ancre
|
||||||
|
comme image de référence et le modèle doit retrouver cet élément sur
|
||||||
|
le screenshot. Évite la double passe describe→ground.
|
||||||
|
"""
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from qwen_vl_utils import process_vision_info
|
from qwen_vl_utils import process_vision_info
|
||||||
|
|
||||||
@@ -69,10 +76,7 @@ def infer(model, processor, req):
|
|||||||
description = req.get("description", "")
|
description = req.get("description", "")
|
||||||
label = f"{target} — {description}" if description else target
|
label = f"{target} — {description}" if description else target
|
||||||
|
|
||||||
if not label.strip():
|
# Image principale (screenshot complet)
|
||||||
return {"x": None, "y": None, "error": "target requis"}
|
|
||||||
|
|
||||||
# Image
|
|
||||||
image_path = req.get("image_path", "")
|
image_path = req.get("image_path", "")
|
||||||
if image_path and os.path.exists(image_path):
|
if image_path and os.path.exists(image_path):
|
||||||
img = Image.open(image_path).convert("RGB")
|
img = Image.open(image_path).convert("RGB")
|
||||||
@@ -82,6 +86,15 @@ def infer(model, processor, req):
|
|||||||
grab = sct.grab(sct.monitors[0])
|
grab = sct.grab(sct.monitors[0])
|
||||||
img = Image.frombytes("RGB", grab.size, grab.bgra, "raw", "BGRX")
|
img = Image.frombytes("RGB", grab.size, grab.bgra, "raw", "BGRX")
|
||||||
|
|
||||||
|
# Image d'ancre (optionnelle) — mode fusionné describe+ground
|
||||||
|
anchor_image_path = req.get("anchor_image_path", "")
|
||||||
|
anchor_img = None
|
||||||
|
if anchor_image_path and os.path.exists(anchor_image_path):
|
||||||
|
anchor_img = Image.open(anchor_image_path).convert("RGB")
|
||||||
|
|
||||||
|
if not label.strip() and anchor_img is None:
|
||||||
|
return {"x": None, "y": None, "error": "target ou anchor_image requis"}
|
||||||
|
|
||||||
W, H = img.size
|
W, H = img.size
|
||||||
factor = 28
|
factor = 28
|
||||||
rH = max(factor, round(H / factor) * factor)
|
rH = max(factor, round(H / factor) * factor)
|
||||||
@@ -92,13 +105,34 @@ def infer(model, processor, req):
|
|||||||
"and then provide the final answer.\n"
|
"and then provide the final answer.\n"
|
||||||
"The reasoning process MUST BE enclosed within <think> </think> tags."
|
"The reasoning process MUST BE enclosed within <think> </think> tags."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Construction du prompt selon le mode
|
||||||
|
if anchor_img is not None:
|
||||||
|
# Mode fusionné : Image1 = crop d'ancre, Image2 = screenshot
|
||||||
|
hint = f' Hint: this element looks like "{label}".' if label.strip() else ""
|
||||||
|
user_text = (
|
||||||
|
f"The first image is a small crop of a UI element captured previously. "
|
||||||
|
f"The second image is the current screen ({rW}x{rH}).{hint}\n"
|
||||||
|
f"Locate on the second image the UI element that visually matches the first image. "
|
||||||
|
f"Output the coordinates using JSON format: "
|
||||||
|
f'[{{"point_2d": [x, y]}}, ...]'
|
||||||
|
)
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": system},
|
||||||
|
{"role": "user", "content": [
|
||||||
|
{"type": "image", "image": anchor_img},
|
||||||
|
{"type": "image", "image": img},
|
||||||
|
{"type": "text", "text": user_text},
|
||||||
|
]},
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
# Mode classique : texte seul
|
||||||
user_text = (
|
user_text = (
|
||||||
f'The screen\'s resolution is {rW}x{rH}.\n'
|
f'The screen\'s resolution is {rW}x{rH}.\n'
|
||||||
f'Locate the UI element(s) for "{label}", '
|
f'Locate the UI element(s) for "{label}", '
|
||||||
f'output the coordinates using JSON format: '
|
f'output the coordinates using JSON format: '
|
||||||
f'[{{"point_2d": [x, y]}}, ...]'
|
f'[{{"point_2d": [x, y]}}, ...]'
|
||||||
)
|
)
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": system},
|
{"role": "system", "content": system},
|
||||||
{"role": "user", "content": [
|
{"role": "user", "content": [
|
||||||
@@ -124,7 +158,8 @@ def infer(model, processor, req):
|
|||||||
trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False,
|
trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False,
|
||||||
)[0].strip()
|
)[0].strip()
|
||||||
|
|
||||||
print(f"[infigui-worker] '{label[:40]}' ({infer_ms:.0f}ms)")
|
mode_str = "fused" if anchor_img is not None else "text"
|
||||||
|
print(f"[infigui-worker] [{mode_str}] '{label[:40]}' ({infer_ms:.0f}ms)")
|
||||||
|
|
||||||
# Parser JSON point_2d
|
# Parser JSON point_2d
|
||||||
json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
|
json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
|
||||||
@@ -153,34 +188,22 @@ def infer(model, processor, req):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
model, processor = load_model()
|
"""Mode one-shot : lit une requête sur stdin, infère, écrit le résultat sur stdout."""
|
||||||
|
# Lire la requête
|
||||||
|
input_data = sys.stdin.read().strip()
|
||||||
|
if not input_data:
|
||||||
|
print(json.dumps({"x": None, "y": None, "error": "pas de requête"}))
|
||||||
|
return
|
||||||
|
|
||||||
# Nettoyer les fichiers résiduels
|
|
||||||
for f in [REQUEST_FILE, RESPONSE_FILE]:
|
|
||||||
if os.path.exists(f):
|
|
||||||
os.unlink(f)
|
|
||||||
|
|
||||||
print(f"[infigui-worker] En attente de requêtes ({REQUEST_FILE})")
|
|
||||||
|
|
||||||
# Boucle : surveiller le fichier de requête
|
|
||||||
while True:
|
|
||||||
if os.path.exists(REQUEST_FILE):
|
|
||||||
try:
|
try:
|
||||||
with open(REQUEST_FILE, "r") as f:
|
req = json.loads(input_data)
|
||||||
req = json.load(f)
|
except json.JSONDecodeError:
|
||||||
os.unlink(REQUEST_FILE)
|
print(json.dumps({"x": None, "y": None, "error": "JSON invalide"}))
|
||||||
|
return
|
||||||
|
|
||||||
|
model, processor = load_model()
|
||||||
result = infer(model, processor, req)
|
result = infer(model, processor, req)
|
||||||
|
print(json.dumps(result))
|
||||||
with open(RESPONSE_FILE, "w") as f:
|
|
||||||
json.dump(result, f)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[infigui-worker] ERREUR: {e}")
|
|
||||||
with open(RESPONSE_FILE, "w") as f:
|
|
||||||
json.dump({"x": None, "y": None, "error": str(e)}, f)
|
|
||||||
|
|
||||||
time.sleep(0.05) # 50ms polling
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -1,50 +1,41 @@
|
|||||||
"""
|
"""
|
||||||
core/grounding/think_arbiter.py — Layer THINK : VLM arbitre (UI-TARS)
|
core/grounding/think_arbiter.py — Layer THINK : VLM arbitre (InfiGUI via subprocess)
|
||||||
|
|
||||||
Appelé UNIQUEMENT quand le SmartMatcher n'a pas assez confiance :
|
Appelé UNIQUEMENT quand le SmartMatcher n'a pas assez confiance.
|
||||||
- Score < 0.60 : aucun candidat clair → UI-TARS cherche dans tout l'écran
|
Utilise le subprocess worker InfiGUI (pas de serveur HTTP).
|
||||||
- Score 0.60-0.90 : candidats ambigus → UI-TARS confirme/infirme
|
|
||||||
|
|
||||||
Le VLM tourne dans un process séparé (serveur FastAPI port 8200).
|
|
||||||
Ce module est un CLIENT HTTP — il ne charge aucun modèle en VRAM.
|
|
||||||
|
|
||||||
Utilisation :
|
Utilisation :
|
||||||
from core.grounding.think_arbiter import ThinkArbiter
|
from core.grounding.think_arbiter import ThinkArbiter
|
||||||
|
|
||||||
arbiter = ThinkArbiter()
|
arbiter = ThinkArbiter()
|
||||||
if arbiter.available:
|
|
||||||
result = arbiter.arbitrate(target, candidates, screenshot)
|
result = arbiter.arbitrate(target, candidates, screenshot)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
import time
|
import time
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from core.grounding.fast_types import DetectedUIElement, LocateResult, MatchCandidate
|
from core.grounding.fast_types import LocateResult, MatchCandidate
|
||||||
from core.grounding.target import GroundingTarget
|
from core.grounding.target import GroundingTarget
|
||||||
|
|
||||||
|
|
||||||
class ThinkArbiter:
|
class ThinkArbiter:
|
||||||
"""Arbitre VLM pour les cas ambigus — appelle le serveur UI-TARS."""
|
"""Arbitre VLM — appelle InfiGUI via subprocess worker."""
|
||||||
|
|
||||||
DEFAULT_URL = "http://localhost:8200"
|
def __init__(self):
|
||||||
|
self._grounder = None
|
||||||
|
|
||||||
def __init__(self, server_url: str = DEFAULT_URL, timeout: int = 30):
|
def _get_grounder(self):
|
||||||
self.server_url = server_url
|
if self._grounder is None:
|
||||||
self.timeout = timeout
|
from core.grounding.ui_tars_grounder import UITarsGrounder
|
||||||
|
self._grounder = UITarsGrounder.get_instance()
|
||||||
|
return self._grounder
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def available(self) -> bool:
|
def available(self) -> bool:
|
||||||
"""Vérifie si le serveur de grounding est accessible."""
|
"""Toujours disponible — le worker se lance à la demande."""
|
||||||
try:
|
return True
|
||||||
import requests
|
|
||||||
resp = requests.get(f"{self.server_url}/health", timeout=3)
|
|
||||||
return resp.status_code == 200 and resp.json().get("model_loaded", False)
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def arbitrate(
|
def arbitrate(
|
||||||
self,
|
self,
|
||||||
@@ -54,62 +45,57 @@ class ThinkArbiter:
|
|||||||
) -> Optional[LocateResult]:
|
) -> Optional[LocateResult]:
|
||||||
"""Demande au VLM de trancher.
|
"""Demande au VLM de trancher.
|
||||||
|
|
||||||
Args:
|
Si target.template_b64 est fourni, on bascule en mode fusionné :
|
||||||
target: Ce qu'on cherche.
|
le crop est passé comme image de référence à InfiGUI, ce qui évite
|
||||||
candidates: Candidats SMART (peut être vide).
|
une description Ollama qwen2.5vl coûteuse en VRAM.
|
||||||
screenshot_pil: Screenshot PIL. Si None, le serveur capture lui-même.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
LocateResult ou None si le VLM ne trouve pas non plus.
|
|
||||||
"""
|
"""
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
|
# Décodage du crop d'ancre si disponible (mode fusionné)
|
||||||
|
anchor_pil = None
|
||||||
|
if target.template_b64:
|
||||||
try:
|
try:
|
||||||
import requests
|
import base64
|
||||||
|
import io
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
# Construire le payload
|
raw_b64 = target.template_b64
|
||||||
payload: Dict[str, Any] = {
|
if ',' in raw_b64:
|
||||||
"target_text": target.text or "",
|
raw_b64 = raw_b64.split(',', 1)[1]
|
||||||
"target_description": target.description or "",
|
anchor_pil = Image.open(io.BytesIO(base64.b64decode(raw_b64))).convert("RGB")
|
||||||
}
|
except Exception as ex:
|
||||||
|
print(f"⚠️ [THINK] Décodage anchor échoué: {ex}")
|
||||||
|
anchor_pil = None
|
||||||
|
|
||||||
# Envoyer l'image si disponible
|
try:
|
||||||
if screenshot_pil is not None:
|
grounder = self._get_grounder()
|
||||||
buf = io.BytesIO()
|
result = grounder.ground(
|
||||||
screenshot_pil.save(buf, format="JPEG", quality=85)
|
target_text=target.text or "",
|
||||||
payload["image_b64"] = base64.b64encode(buf.getvalue()).decode("utf-8")
|
target_description=target.description or "",
|
||||||
|
screen_pil=screenshot_pil,
|
||||||
# Appel au serveur
|
anchor_pil=anchor_pil,
|
||||||
resp = requests.post(
|
|
||||||
f"{self.server_url}/ground",
|
|
||||||
json=payload,
|
|
||||||
timeout=self.timeout,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
dt = (time.time() - t0) * 1000
|
dt = (time.time() - t0) * 1000
|
||||||
|
|
||||||
if resp.status_code != 200:
|
if result is None:
|
||||||
print(f"🤔 [THINK] Serveur HTTP {resp.status_code}")
|
label = target.text or "<crop>"
|
||||||
|
print(f"🤔 [THINK] VLM n'a pas trouvé '{label}' ({dt:.0f}ms)")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
data = resp.json()
|
method = "think_vlm_fused" if anchor_pil is not None else "think_vlm"
|
||||||
|
locate = LocateResult(
|
||||||
if data.get("x") is None:
|
x=result.x,
|
||||||
print(f"🤔 [THINK] VLM n'a pas trouvé '{target.text}' ({dt:.0f}ms)")
|
y=result.y,
|
||||||
return None
|
confidence=result.confidence,
|
||||||
|
method=method,
|
||||||
result = LocateResult(
|
|
||||||
x=data["x"],
|
|
||||||
y=data["y"],
|
|
||||||
confidence=data.get("confidence", 0.85),
|
|
||||||
method="think_vlm",
|
|
||||||
time_ms=dt,
|
time_ms=dt,
|
||||||
tier="think",
|
tier="think",
|
||||||
candidates_count=len(candidates),
|
candidates_count=len(candidates),
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"🤔 [THINK] VLM → ({result.x}, {result.y}) conf={result.confidence:.2f} ({dt:.0f}ms)")
|
print(f"🤔 [THINK/{method}] ({result.x}, {result.y}) conf={result.confidence:.2f} ({dt:.0f}ms)")
|
||||||
return result
|
return locate
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
dt = (time.time() - t0) * 1000
|
dt = (time.time() - t0) * 1000
|
||||||
|
|||||||
@@ -1,21 +1,18 @@
|
|||||||
"""
|
"""
|
||||||
core/grounding/ui_tars_grounder.py — Grounding via worker InfiGUI indépendant
|
core/grounding/ui_tars_grounder.py — Grounding via script one-shot InfiGUI
|
||||||
|
|
||||||
Communication par fichiers :
|
Chaque appel lance un subprocess Python qui charge le modèle, infère, et quitte.
|
||||||
- Écrit la requête dans /tmp/infigui_request.json
|
Lent (~15s) mais fiable — pas de crash CUDA en process persistant.
|
||||||
- Le worker lit, infère, écrit la réponse dans /tmp/infigui_response.json
|
|
||||||
- Le grounder lit la réponse
|
|
||||||
|
|
||||||
Le worker est un process indépendant lancé par start_grounding_worker.sh,
|
|
||||||
PAS un subprocess de Flask.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import subprocess
|
||||||
|
import sys
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from core.grounding.target import GroundingResult
|
from core.grounding.target import GroundingResult
|
||||||
@@ -23,16 +20,15 @@ from core.grounding.target import GroundingResult
|
|||||||
_instance: Optional[UITarsGrounder] = None
|
_instance: Optional[UITarsGrounder] = None
|
||||||
_instance_lock = threading.Lock()
|
_instance_lock = threading.Lock()
|
||||||
|
|
||||||
REQUEST_FILE = "/tmp/infigui_request.json"
|
|
||||||
RESPONSE_FILE = "/tmp/infigui_response.json"
|
|
||||||
READY_FILE = "/tmp/infigui_ready"
|
|
||||||
|
|
||||||
|
|
||||||
class UITarsGrounder:
|
class UITarsGrounder:
|
||||||
"""Grounding via worker InfiGUI indépendant — communication par fichiers."""
|
"""Grounding via script one-shot InfiGUI."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
|
self._project_root = os.path.abspath(
|
||||||
|
os.path.join(os.path.dirname(__file__), "..", "..")
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_instance(cls) -> UITarsGrounder:
|
def get_instance(cls) -> UITarsGrounder:
|
||||||
@@ -45,68 +41,111 @@ class UITarsGrounder:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def available(self) -> bool:
|
def available(self) -> bool:
|
||||||
return os.path.exists(READY_FILE)
|
return True # Toujours disponible — le script se lance à la demande
|
||||||
|
|
||||||
def ground(
|
def ground(
|
||||||
self,
|
self,
|
||||||
target_text: str = "",
|
target_text: str = "",
|
||||||
target_description: str = "",
|
target_description: str = "",
|
||||||
screen_pil=None,
|
screen_pil=None,
|
||||||
|
anchor_pil=None,
|
||||||
) -> Optional[GroundingResult]:
|
) -> Optional[GroundingResult]:
|
||||||
"""Localise un élément UI via le worker InfiGUI."""
|
"""Localise un élément UI via un script one-shot InfiGUI.
|
||||||
if not self.available:
|
|
||||||
print("[InfiGUI] Worker non démarré (pas de /tmp/infigui_ready)")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
target_text: nom textuel de la cible (peut être vide si anchor_pil fourni).
|
||||||
|
target_description: description sémantique libre.
|
||||||
|
screen_pil: screenshot complet (PIL.Image).
|
||||||
|
anchor_pil: crop visuel de l'ancre capturée précédemment (PIL.Image).
|
||||||
|
Si fourni, le worker passe en mode fusionné : Image1=crop, Image2=screen,
|
||||||
|
"trouve sur l'image 2 l'élément visuel de l'image 1".
|
||||||
|
"""
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with self._lock:
|
with self._lock:
|
||||||
# Sauver l'image si fournie
|
# Sauver l'image principale
|
||||||
image_path = ""
|
|
||||||
if screen_pil is not None:
|
|
||||||
image_path = "/tmp/infigui_screen.png"
|
image_path = "/tmp/infigui_screen.png"
|
||||||
|
if screen_pil is not None:
|
||||||
screen_pil.save(image_path)
|
screen_pil.save(image_path)
|
||||||
|
|
||||||
# Écrire la requête
|
# Sauver l'image d'ancre (mode fusionné)
|
||||||
req = {
|
anchor_image_path = ""
|
||||||
|
if anchor_pil is not None:
|
||||||
|
anchor_image_path = "/tmp/infigui_anchor.png"
|
||||||
|
anchor_pil.save(anchor_image_path)
|
||||||
|
|
||||||
|
# Construire la requête JSON
|
||||||
|
req = json.dumps({
|
||||||
"target": target_text,
|
"target": target_text,
|
||||||
"description": target_description,
|
"description": target_description,
|
||||||
"image_path": image_path,
|
"image_path": image_path,
|
||||||
"timestamp": time.time(),
|
"anchor_image_path": anchor_image_path,
|
||||||
}
|
})
|
||||||
|
|
||||||
# Supprimer l'ancienne réponse
|
mode_str = "fused" if anchor_pil is not None else "text"
|
||||||
if os.path.exists(RESPONSE_FILE):
|
label_short = target_text[:30] if target_text else "<crop only>"
|
||||||
os.unlink(RESPONSE_FILE)
|
print(f"🎯 [InfiGUI] Lancement one-shot [{mode_str}]: '{label_short}'")
|
||||||
|
|
||||||
# Écrire la requête
|
# Lancer le script one-shot
|
||||||
with open(REQUEST_FILE, "w") as f:
|
# IMPORTANT: depuis un service systemd où le parent a déjà chargé CUDA,
|
||||||
json.dump(req, f)
|
# le subprocess hérite d'un état GPU cassé (No CUDA GPUs available).
|
||||||
|
# Solutions : start_new_session=True (nouveau cgroup) + forcer
|
||||||
|
# CUDA_VISIBLE_DEVICES=0 explicitement pour bypass l'héritage parent.
|
||||||
|
_child_env = {**os.environ}
|
||||||
|
_child_env["PYTHONDONTWRITEBYTECODE"] = "1"
|
||||||
|
_child_env["CUDA_VISIBLE_DEVICES"] = "0"
|
||||||
|
_child_env["NVIDIA_VISIBLE_DEVICES"] = "all"
|
||||||
|
# Supprimer les variables Python qui pourraient pointer sur l'état parent
|
||||||
|
_child_env.pop("PYTORCH_NVML_BASED_CUDA_CHECK", None)
|
||||||
|
|
||||||
# Attendre la réponse (max 30s)
|
result = subprocess.run(
|
||||||
for _ in range(300):
|
[sys.executable, "-m", "core.grounding.infigui_worker"],
|
||||||
if os.path.exists(RESPONSE_FILE):
|
input=req + "\n",
|
||||||
time.sleep(0.05) # Laisser le fichier se fermer
|
capture_output=True,
|
||||||
try:
|
text=True,
|
||||||
with open(RESPONSE_FILE, "r") as f:
|
timeout=60,
|
||||||
data = json.load(f)
|
cwd=self._project_root,
|
||||||
os.unlink(RESPONSE_FILE)
|
env=_child_env,
|
||||||
break
|
start_new_session=True, # nouveau session group, isole du parent
|
||||||
except (json.JSONDecodeError, IOError):
|
close_fds=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
stderr_lines = (result.stderr or '').strip().split('\n')
|
||||||
|
# Afficher les dernières lignes significatives du stderr
|
||||||
|
last_err = [l for l in stderr_lines[-5:] if l.strip()]
|
||||||
|
print(f"⚠️ [InfiGUI] Script échoué (code {result.returncode})")
|
||||||
|
for l in last_err:
|
||||||
|
print(f" ❌ {l}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Parser la sortie — chercher la ligne JSON de résultat
|
||||||
|
data = None
|
||||||
|
for line in result.stdout.strip().split("\n"):
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
continue
|
continue
|
||||||
time.sleep(0.1)
|
try:
|
||||||
else:
|
parsed = json.loads(line)
|
||||||
print(f"⚠️ [InfiGUI] Timeout 30s — worker ne répond pas")
|
if "x" in parsed:
|
||||||
|
data = parsed
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if data is None:
|
||||||
|
print(f"⚠️ [InfiGUI] Pas de réponse JSON dans la sortie")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
dt = (time.time() - t0) * 1000
|
dt = (time.time() - t0) * 1000
|
||||||
|
|
||||||
if data.get("x") is not None:
|
if data.get("x") is not None:
|
||||||
print(f"🎯 [InfiGUI] ({data['x']}, {data['y']}) conf={data.get('confidence', 0):.2f} ({dt:.0f}ms)")
|
method_name = "infigui_fused" if anchor_pil is not None else "infigui"
|
||||||
|
print(f"🎯 [InfiGUI/{method_name}] ({data['x']}, {data['y']}) "
|
||||||
|
f"conf={data.get('confidence', 0):.2f} ({dt:.0f}ms)")
|
||||||
return GroundingResult(
|
return GroundingResult(
|
||||||
x=data["x"], y=data["y"],
|
x=data["x"], y=data["y"],
|
||||||
method="infigui",
|
method=method_name,
|
||||||
confidence=data.get("confidence", 0.90),
|
confidence=data.get("confidence", 0.90),
|
||||||
time_ms=dt,
|
time_ms=dt,
|
||||||
)
|
)
|
||||||
@@ -114,6 +153,9 @@ class UITarsGrounder:
|
|||||||
print(f"⚠️ [InfiGUI] Pas trouvé ({dt:.0f}ms)")
|
print(f"⚠️ [InfiGUI] Pas trouvé ({dt:.0f}ms)")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
print(f"⚠️ [InfiGUI] Timeout 60s")
|
||||||
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ [InfiGUI] Erreur: {e}")
|
print(f"⚠️ [InfiGUI] Erreur: {e}")
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -896,15 +896,15 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
_fc_target_text = params.get('_step_label', '')
|
_fc_target_text = params.get('_step_label', '')
|
||||||
_action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
|
_action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
|
||||||
'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
|
'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
|
||||||
if _fc_target_text in _action_types and screenshot_base64:
|
# Note: plus d'appel à _describe_anchor_image() (qwen2.5vl) ici.
|
||||||
try:
|
# Le crop d'ancre (screenshot_base64) est utilisé directement par
|
||||||
from core.execution.input_handler import _describe_anchor_image
|
# le template matching pixel-perfect en avant-poste, puis InfiGUI
|
||||||
_desc = _describe_anchor_image(screenshot_base64)
|
# en mode fusionné si nécessaire (option 2.c+2.a). Économise ~9.4 GB
|
||||||
if _desc:
|
# de VRAM Ollama qui rentrait en conflit avec InfiGUI.
|
||||||
print(f"🏷️ [Vision] Ancre décrite: '{_desc}'")
|
if _fc_target_text in _action_types:
|
||||||
_fc_target_text = _desc
|
# Marquer le label comme garbage pour que le pipeline
|
||||||
except Exception:
|
# bascule sur le mode fusionné via template_b64.
|
||||||
pass
|
_fc_target_text = ''
|
||||||
_fc_target_desc = params.get('visual_anchor', {}).get('description', '')
|
_fc_target_desc = params.get('visual_anchor', {}).get('description', '')
|
||||||
|
|
||||||
x, y, confidence, method_used = None, None, 0, ''
|
x, y, confidence, method_used = None, None, 0, ''
|
||||||
|
|||||||
Reference in New Issue
Block a user