feat: Léa chat + IRBuilder enrichi (stratégies V4 complètes)
Aspect 2/4 Léa : interface conversationnelle
- chat_interface.py : ChatSession thread-safe, états idle/planning/awaiting/executing/done
- 5 endpoints REST : /api/v1/chat/* (session, message, history, confirm, sessions)
- web_dashboard/chat.html + chat.js : UI minimaliste, polling 2s, pas de framework
- Proxy Flask /api/chat/* → serveur streaming
- 34 tests (happy path, abandon, refus, erreurs, gemma4 down)
IRBuilder enrichi pour plans V4 complets
- _event_to_action() appelle enrich_click_from_screenshot() quand session_dir dispo
- Chaque clic porte _enrichment (by_text OCR, anchor_image_base64, vlm_description)
- ExecutionCompiler consomme l'enrichissement pour produire 3 stratégies par clic
Avant : [ocr] uniquement, target="unknown_window"
Après : [ocr, template, vlm] avec vrai texte OCR ("Rechercher", "Ouvrir")
Validé sur session réelle : 10/10 clics enrichis (by_text + anchor + vlm_description)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -208,18 +208,32 @@ class ExecutionCompiler:
|
||||
) -> tuple:
|
||||
"""Compiler les stratégies de résolution pour un clic.
|
||||
|
||||
Utilise les données d'enrichissement visuel (action._enrichment) si
|
||||
disponibles (crop anchor, description VLM, window_capture).
|
||||
|
||||
Ordre de priorité :
|
||||
1. OCR exact (si texte connu) — 100ms, pixel-perfect
|
||||
2. Template matching (si crop disponible) — 10ms, même interface
|
||||
3. Position relative (si hint disponible) — instantané, fragile
|
||||
4. VLM (dernier recours) — 2-5s, exception handler
|
||||
1. OCR exact (si by_text disponible) — 100ms, pixel-perfect
|
||||
2. Template matching (si anchor_image_base64) — 10ms
|
||||
3. VLM (vlm_description) — 2-5s, exception handler
|
||||
|
||||
Le learning peut réordonner si une stratégie a mieux marché avant.
|
||||
"""
|
||||
primary = None
|
||||
fallbacks = []
|
||||
|
||||
target_text = action.anchor_hint or action.target
|
||||
# Lire l'enrichissement visuel si dispo
|
||||
enrichment = getattr(action, "_enrichment", None) or {}
|
||||
by_text_from_enrich = enrichment.get("by_text", "")
|
||||
anchor_b64 = enrichment.get("anchor_image_base64", "")
|
||||
vlm_desc_from_enrich = enrichment.get("vlm_description", "")
|
||||
window_title = enrichment.get("window_title", "")
|
||||
|
||||
# Source de texte : enrichissement > anchor_hint > target
|
||||
target_text = by_text_from_enrich or action.anchor_hint or action.target
|
||||
# Ne pas utiliser "unknown_window" comme texte OCR
|
||||
if target_text == "unknown_window":
|
||||
target_text = ""
|
||||
|
||||
learned_method = learned.get(target_text, "")
|
||||
|
||||
# Stratégie OCR — le texte visible est la meilleure ancre
|
||||
@@ -227,41 +241,49 @@ class ExecutionCompiler:
|
||||
ocr_strategy = ResolutionStrategy(
|
||||
method="ocr",
|
||||
target_text=target_text,
|
||||
threshold=0.8,
|
||||
threshold=0.7,
|
||||
)
|
||||
# Si le learning dit que l'OCR marche pour cette cible, c'est la primaire
|
||||
if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct"):
|
||||
if not learned_method or learned_method in ("ocr", "som_text_match", "hybrid_text_direct", "v4_ocr"):
|
||||
primary = ocr_strategy
|
||||
else:
|
||||
fallbacks.append(ocr_strategy)
|
||||
|
||||
# Stratégie template — le crop visuel de l'enregistrement
|
||||
if action.anchor_hint:
|
||||
if anchor_b64:
|
||||
template_strategy = ResolutionStrategy(
|
||||
method="template",
|
||||
target_text=action.anchor_hint,
|
||||
target_text=target_text,
|
||||
anchor_b64=anchor_b64,
|
||||
threshold=0.85,
|
||||
)
|
||||
if learned_method in ("anchor_template", "template_matching"):
|
||||
if learned_method in ("anchor_template", "template_matching", "v4_template"):
|
||||
if primary:
|
||||
fallbacks.insert(0, primary)
|
||||
primary = template_strategy
|
||||
else:
|
||||
fallbacks.append(template_strategy)
|
||||
|
||||
# Stratégie VLM — exception handler (dernier recours)
|
||||
vlm_description = action.target or step.intent
|
||||
vlm_strategy = ResolutionStrategy(
|
||||
method="vlm",
|
||||
vlm_description=vlm_description,
|
||||
threshold=0.6,
|
||||
)
|
||||
fallbacks.append(vlm_strategy)
|
||||
vlm_description = vlm_desc_from_enrich or action.target or step.intent
|
||||
if vlm_description and vlm_description != "unknown_window":
|
||||
vlm_strategy = ResolutionStrategy(
|
||||
method="vlm",
|
||||
vlm_description=vlm_description,
|
||||
threshold=0.6,
|
||||
)
|
||||
fallbacks.append(vlm_strategy)
|
||||
|
||||
# Si aucune primaire trouvée, utiliser le VLM
|
||||
# Si aucune primaire trouvée, prendre le premier fallback
|
||||
if primary is None:
|
||||
if fallbacks:
|
||||
primary = fallbacks.pop(0)
|
||||
else:
|
||||
primary = vlm_strategy
|
||||
# Dernier recours : VLM avec l'intention métier
|
||||
primary = ResolutionStrategy(
|
||||
method="vlm",
|
||||
vlm_description=step.intent or "élément UI",
|
||||
threshold=0.5,
|
||||
)
|
||||
|
||||
return primary, fallbacks
|
||||
|
||||
|
||||
@@ -63,6 +63,15 @@ class IRBuilder:
|
||||
"""
|
||||
t_start = time.time()
|
||||
|
||||
# Résoudre le session_dir_path pour l'enrichissement visuel
|
||||
session_dir_path = Path(session_dir) if session_dir else None
|
||||
if session_dir_path and not session_dir_path.is_dir():
|
||||
logger.warning(
|
||||
f"IRBuilder: session_dir '{session_dir}' introuvable — "
|
||||
f"enrichissement visuel désactivé"
|
||||
)
|
||||
session_dir_path = None
|
||||
|
||||
# Créer le WorkflowIR vide
|
||||
ir = WorkflowIR.new(
|
||||
name=name or f"Workflow du {time.strftime('%d/%m/%Y %H:%M')}",
|
||||
@@ -90,6 +99,7 @@ class IRBuilder:
|
||||
total_steps=len(segments),
|
||||
workflow_name=ir.name,
|
||||
domain=domain,
|
||||
session_dir_path=session_dir_path,
|
||||
)
|
||||
ir.steps.append(step)
|
||||
|
||||
@@ -189,6 +199,7 @@ class IRBuilder:
|
||||
total_steps: int,
|
||||
workflow_name: str,
|
||||
domain: str,
|
||||
session_dir_path: Optional[Path] = None,
|
||||
) -> Step:
|
||||
"""Construire une Step depuis un segment d'événements.
|
||||
|
||||
@@ -197,7 +208,7 @@ class IRBuilder:
|
||||
# Construire la description du segment pour gemma4
|
||||
actions = []
|
||||
for evt in segment:
|
||||
action = self._event_to_action(evt)
|
||||
action = self._event_to_action(evt, session_dir_path=session_dir_path)
|
||||
if action:
|
||||
actions.append(action)
|
||||
|
||||
@@ -217,17 +228,49 @@ class IRBuilder:
|
||||
actions=actions,
|
||||
)
|
||||
|
||||
def _event_to_action(self, evt: Dict) -> Optional[Action]:
|
||||
"""Convertir un événement brut en Action."""
|
||||
def _event_to_action(self, evt: Dict, session_dir_path: Optional[Path] = None) -> Optional[Action]:
|
||||
"""Convertir un événement brut en Action enrichie.
|
||||
|
||||
Pour les clics : appelle enrich_click_from_screenshot() si le session_dir
|
||||
est disponible pour obtenir :
|
||||
- by_text (texte OCR exact de l'élément cliqué)
|
||||
- anchor_image_base64 (crop 80x80 pour template matching)
|
||||
- vlm_description (description positionnelle)
|
||||
- window_capture (rect pour le grounding ciblé)
|
||||
|
||||
Cet enrichissement est LA clé pour que l'ExecutionCompiler produise
|
||||
des plans V4 complets avec toutes les stratégies (OCR + template + VLM).
|
||||
"""
|
||||
evt_type = evt.get("type", "")
|
||||
|
||||
if evt_type == "mouse_click":
|
||||
window = evt.get("window", {}).get("title", "")
|
||||
return Action(
|
||||
pos = evt.get("pos", [0, 0])
|
||||
|
||||
# Action de base (fallback sans enrichissement)
|
||||
action = Action(
|
||||
type="click",
|
||||
target=window,
|
||||
anchor_hint=evt.get("vision_info", {}).get("text", ""),
|
||||
anchor_hint=evt.get("vision_info", {}).get("text", "") if isinstance(evt.get("vision_info"), dict) else "",
|
||||
)
|
||||
|
||||
# Enrichissement visuel via enrich_click_from_screenshot
|
||||
# Accès direct au crop OCR + anchor pour l'ExecutionCompiler
|
||||
if session_dir_path and isinstance(pos, list) and len(pos) == 2:
|
||||
enrichment = self._enrich_click(
|
||||
evt, session_dir_path, window, int(pos[0]), int(pos[1]),
|
||||
)
|
||||
if enrichment:
|
||||
# Le texte OCR devient l'anchor_hint pour l'OCR primaire
|
||||
by_text = enrichment.get("by_text", "")
|
||||
if by_text:
|
||||
action.anchor_hint = by_text
|
||||
# Stocker les métadonnées d'enrichissement dans l'action
|
||||
# (utilisé par l'ExecutionCompiler pour construire les stratégies)
|
||||
action._enrichment = enrichment
|
||||
|
||||
return action
|
||||
|
||||
elif evt_type == "text_input":
|
||||
text = evt.get("text", "")
|
||||
if text:
|
||||
@@ -241,6 +284,55 @@ class IRBuilder:
|
||||
|
||||
return None
|
||||
|
||||
def _enrich_click(
|
||||
self,
|
||||
evt: Dict,
|
||||
session_dir_path: Path,
|
||||
window_title: str,
|
||||
click_x: int,
|
||||
click_y: int,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Enrichir un clic avec OCR + crop + description.
|
||||
|
||||
Réutilise enrich_click_from_screenshot du stream_processor (éprouvé).
|
||||
Retourne un dict avec by_text, anchor_image_base64, vlm_description, etc.
|
||||
"""
|
||||
try:
|
||||
from agent_v0.server_v1.stream_processor import enrich_click_from_screenshot
|
||||
|
||||
# Trouver le screenshot full
|
||||
screenshot_id = evt.get("screenshot_id", "")
|
||||
if not screenshot_id:
|
||||
return None
|
||||
|
||||
full_path = session_dir_path / "shots" / f"{screenshot_id}_full.png"
|
||||
if not full_path.is_file():
|
||||
return None
|
||||
|
||||
# Résolution écran
|
||||
screen_w = 1280
|
||||
screen_h = 800
|
||||
window_capture = evt.get("window_capture", {})
|
||||
if window_capture.get("window_rect"):
|
||||
rect = window_capture["window_rect"]
|
||||
screen_w = max(screen_w, rect[2])
|
||||
screen_h = max(screen_h, rect[3])
|
||||
|
||||
return enrich_click_from_screenshot(
|
||||
screenshot_path=full_path,
|
||||
click_x=click_x,
|
||||
click_y=click_y,
|
||||
screen_w=screen_w,
|
||||
screen_h=screen_h,
|
||||
window_title=window_title,
|
||||
vision_info=evt.get("vision_info") if isinstance(evt.get("vision_info"), dict) else None,
|
||||
session_dir=session_dir_path,
|
||||
screenshot_id=screenshot_id,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(f"IRBuilder._enrich_click: {e}")
|
||||
return None
|
||||
|
||||
def _describe_segment(self, segment: List[Dict]) -> str:
|
||||
"""Décrire un segment en langage naturel (pour gemma4)."""
|
||||
parts = []
|
||||
|
||||
Reference in New Issue
Block a user