feat: mode apprentissage — retry échoué + écran inchangé déclenchent la capture humaine

Trois chemins vers le mode apprentissage supervisé :
1. Grounding échoue → Policy RETRY → retry échoue → capture humaine
2. Clic visuel sans effet (écran inchangé après 3 s) → capture humaine
3. Policy SUPERVISE direct → capture humaine

La capture enregistre un mini-workflow complet (clics + frappes + combos)
jusqu'à Ctrl+Shift+L ou 10 s d'inactivité. La correction est envoyée au serveur.

Testé E2E : workflow Chrome avec résultats Google dynamiques +
bandeau cookies — Léa demande de l'aide, capture la correction, puis reprend.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-13 08:33:57 +02:00
parent 33c198b827
commit d5285de99c

View File

@@ -760,16 +760,48 @@ class ActionExecutorV1:
result["visual_resolved"] = True result["visual_resolved"] = True
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}") print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
else: else:
result["success"] = False # Retry échoué → mode apprentissage
result["error"] = "target_not_found" # Léa a tout essayé (UIA, template, VLM, retry)
result["target_description"] = target_desc # et ne trouve toujours pas. L'humain doit montrer.
result["target_spec"] = target_spec print(f" [POLICY] Retry échoué → mode apprentissage")
result["screenshot"] = self._capture_screenshot_b64() try:
result["warning"] = "visual_resolve_failed" self.notifier.replay_target_not_found(
self.notifier.replay_target_not_found( target_desc,
target_desc, target_spec.get("window_title", ""),
target_spec.get("window_title", ""), )
except Exception:
pass
human_actions = self._capture_human_correction(
timeout_s=120,
) )
if human_actions:
result["success"] = True
result["resolution_method"] = "human_supervised"
result["warning"] = "human_supervised_after_retry_failed"
last_click = None
for ha in reversed(human_actions):
if ha.get("type") == "click":
last_click = ha
break
if last_click:
result["actual_position"] = {
"x_pct": last_click["x_pct"],
"y_pct": last_click["y_pct"],
}
result["correction"] = {
"actions": human_actions,
"action_count": len(human_actions),
"last_click": last_click,
"trigger": "retry_failed",
}
else:
result["success"] = False
result["error"] = "target_not_found"
result["target_description"] = target_desc
result["target_spec"] = target_spec
result["screenshot"] = self._capture_screenshot_b64()
result["warning"] = "visual_resolve_failed"
return result return result
elif policy_decision.decision == Decision.SKIP: elif policy_decision.decision == Decision.SKIP:
@@ -1004,33 +1036,66 @@ class ActionExecutorV1:
hash_before, timeout_ms=3000 hash_before, timeout_ms=3000
) )
if not screen_changed: if not screen_changed:
# ── Recovery : tenter un rollback si l'action n'a pas eu d'effet ──
from .recovery import RecoveryEngine
recovery = RecoveryEngine(self)
recovery_result = recovery.attempt(
failed_action=action,
critic_detail="L'écran n'a pas changé après l'action",
)
if recovery_result.success:
print(f" [RECOVERY] {recovery_result.detail}")
result["recovery"] = recovery_result.to_dict()
result["success"] = False
result["warning"] = "no_screen_change"
result["error"] = "Ecran inchange apres l'action"
print(
f" [ECHEC] Ecran inchange apres {action_type}"
f"l'action n'a pas eu d'effet visible"
)
logger.warning( logger.warning(
f"[LEA] Écran inchangé après {action_type} " f"[LEA] Écran inchangé après {action_type} "
f"(action_id={action_id}) — pas d'effet visible" f"(action_id={action_id}) — pas d'effet visible"
) )
# Notifier l'utilisateur en français naturel (niveau ATTENTION)
try: # ── Mode apprentissage : clic sans effet = mauvais clic ──
self.notifier.replay_no_screen_change(action_type) # Si l'action était un clic visuel, l'écran inchangé prouve
except Exception: # que le grounding a cliqué au mauvais endroit. Au lieu de
pass # passer silencieusement à la suite, Léa demande à l'humain.
if action_type == "click" and visual_mode:
print(
f" [ECHEC] Clic sans effet — "
f"je demande de l'aide"
)
try:
self.notifier.replay_no_screen_change(action_type)
except Exception:
pass
human_actions = self._capture_human_correction(
timeout_s=120,
)
if human_actions:
result["success"] = True
result["resolution_method"] = "human_supervised"
result["warning"] = "human_supervised_after_no_change"
last_click = None
for ha in reversed(human_actions):
if ha.get("type") == "click":
last_click = ha
break
if last_click:
result["actual_position"] = {
"x_pct": last_click["x_pct"],
"y_pct": last_click["y_pct"],
}
result["correction"] = {
"actions": human_actions,
"action_count": len(human_actions),
"last_click": last_click,
"trigger": "no_screen_change",
}
else:
# Timeout — l'humain n'a pas répondu
result["success"] = False
result["warning"] = "no_screen_change"
result["error"] = "Ecran inchange apres l'action"
else:
# Actions non-visuelles : comportement existant
result["success"] = False
result["warning"] = "no_screen_change"
result["error"] = "Ecran inchange apres l'action"
print(
f" [ECHEC] Ecran inchange apres {action_type}"
f"l'action n'a pas eu d'effet visible"
)
try:
self.notifier.replay_no_screen_change(action_type)
except Exception:
pass
else: else:
print(f" [OK] Changement d'ecran detecte apres {action_type}") print(f" [OK] Changement d'ecran detecte apres {action_type}")
else: else: