feat: chat unifié, GestureCatalog, Copilot, Léa UI, extraction données, vérification replay
Refonte majeure du système Agent Chat et ajout de nombreux modules : - Chat unifié : suppression du dual Workflows/Agent Libre, tout passe par /api/chat avec résolution en 3 niveaux (workflow → geste → "montre-moi") - GestureCatalog : 38 raccourcis clavier universels Windows avec matching sémantique, substitution automatique dans les replays, et endpoint /api/gestures - Mode Copilot : exécution pas-à-pas des workflows avec validation humaine via WebSocket (approve/skip/abort) avant chaque action - Léa UI (agent_v0/lea_ui/) : interface PyQt5 pour Windows avec overlay transparent pour feedback visuel pendant le replay - Data Extraction (core/extraction/) : moteur d'extraction visuelle de données (OCR + VLM → SQLite), avec schémas YAML et export CSV/Excel - ReplayVerifier (agent_v0/server_v1/) : vérification post-action par comparaison de screenshots, avec logique de retry (max 3) - IntentParser durci : meilleur fallback regex, type GREETING, patterns améliorés - Dashboard : nouvelles pages gestures, streaming, extractions - Tests : 63 tests GestureCatalog, 47 tests extraction, corrections tests existants - Dépréciation : /api/agent/plan et /api/agent/execute retournent HTTP 410, suppression du code hardcodé _plan_to_replay_actions Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,8 +24,9 @@ Example:
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from collections import defaultdict
|
||||
import os
|
||||
from typing import List, Dict, Optional, Tuple, Any
|
||||
from collections import defaultdict, Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
@@ -106,6 +107,7 @@ class GraphBuilder:
|
||||
self.clustering_eps = clustering_eps
|
||||
self.clustering_min_samples = clustering_min_samples
|
||||
self.enable_quality_validation = enable_quality_validation
|
||||
self._screen_analyzer = None # ScreenAnalyzer (lazy import)
|
||||
|
||||
logger.info(
|
||||
f"GraphBuilder initialized: "
|
||||
@@ -119,39 +121,47 @@ class GraphBuilder:
|
||||
self,
|
||||
session: RawSession,
|
||||
workflow_name: Optional[str] = None,
|
||||
precomputed_states: Optional[List["ScreenState"]] = None,
|
||||
) -> Workflow:
|
||||
"""
|
||||
Construire un Workflow complet depuis une RawSession.
|
||||
|
||||
|
||||
Processus:
|
||||
1. Créer ScreenStates depuis screenshots
|
||||
1. Créer ScreenStates depuis screenshots (ou utiliser precomputed_states)
|
||||
2. Calculer embeddings pour chaque état
|
||||
3. Détecter patterns via clustering
|
||||
4. Construire nodes depuis clusters
|
||||
5. Construire edges depuis transitions
|
||||
|
||||
|
||||
Args:
|
||||
session: Session brute à analyser
|
||||
workflow_name: Nom du workflow (généré si None)
|
||||
|
||||
precomputed_states: ScreenStates déjà analysés (streaming).
|
||||
Si fourni, saute l'étape 1 (pas de re-analyse via ScreenAnalyzer).
|
||||
|
||||
Returns:
|
||||
Workflow construit avec nodes et edges
|
||||
|
||||
|
||||
Raises:
|
||||
ValueError: Si la session est vide ou invalide
|
||||
"""
|
||||
if not session.screenshots:
|
||||
raise ValueError("Session has no screenshots")
|
||||
|
||||
if not precomputed_states and not session.screenshots:
|
||||
raise ValueError("Session has no screenshots and no precomputed states")
|
||||
|
||||
logger.info(
|
||||
f"Building workflow from session {session.session_id} "
|
||||
f"with {len(session.screenshots)} screenshots"
|
||||
f"with {len(precomputed_states or session.screenshots)} "
|
||||
f"{'precomputed states' if precomputed_states else 'screenshots'}"
|
||||
)
|
||||
|
||||
# Étape 1: Créer ScreenStates
|
||||
screen_states = self._create_screen_states(session)
|
||||
logger.debug(f"Created {len(screen_states)} screen states")
|
||||
|
||||
|
||||
# Étape 1: Créer ScreenStates (ou réutiliser ceux pré-calculés)
|
||||
if precomputed_states:
|
||||
screen_states = precomputed_states
|
||||
logger.debug(f"Using {len(screen_states)} precomputed screen states")
|
||||
else:
|
||||
screen_states = self._create_screen_states(session)
|
||||
logger.debug(f"Created {len(screen_states)} screen states")
|
||||
|
||||
# Étape 2: Calculer embeddings
|
||||
embeddings = self._compute_embeddings(screen_states)
|
||||
logger.debug(f"Computed {len(embeddings)} embeddings")
|
||||
@@ -315,16 +325,31 @@ class GraphBuilder:
|
||||
file_size_bytes=screenshot_path.stat().st_size if screenshot_path.exists() else 0
|
||||
)
|
||||
|
||||
# Créer PerceptionLevel (sera enrichi par embedding_builder)
|
||||
# Créer PerceptionLevel — enrichir avec OCR si le screenshot existe
|
||||
detected_text = []
|
||||
text_method = "none"
|
||||
|
||||
if screenshot_path.exists():
|
||||
try:
|
||||
if self._screen_analyzer is None:
|
||||
from core.pipeline.screen_analyzer import ScreenAnalyzer
|
||||
self._screen_analyzer = ScreenAnalyzer(session_id=session.session_id)
|
||||
extracted = self._screen_analyzer._extract_text(str(screenshot_path))
|
||||
if extracted:
|
||||
detected_text = extracted
|
||||
text_method = self._screen_analyzer._get_ocr_method_name()
|
||||
except Exception as e:
|
||||
logger.debug(f"OCR échoué pour {screenshot_path}: {e}")
|
||||
|
||||
perception = PerceptionLevel(
|
||||
embedding=EmbeddingRef(
|
||||
provider="openclip_ViT-B-32",
|
||||
vector_id=f"data/embeddings/screens/{session.session_id}_state_{i:04d}.npy",
|
||||
dimensions=512
|
||||
),
|
||||
detected_text=[], # Sera rempli par VLM/OCR
|
||||
text_detection_method="pending",
|
||||
confidence_avg=0.0
|
||||
detected_text=detected_text,
|
||||
text_detection_method=text_method,
|
||||
confidence_avg=0.85 if detected_text else 0.0
|
||||
)
|
||||
|
||||
# Créer ContextLevel
|
||||
@@ -504,8 +529,12 @@ class GraphBuilder:
|
||||
node = WorkflowNode(
|
||||
node_id=f"node_{cluster_id:03d}",
|
||||
name=f"State Pattern {cluster_id}",
|
||||
screen_template=template,
|
||||
observation_count=len(indices),
|
||||
description=f"Pattern auto-détecté ({len(indices)} observations)",
|
||||
template=template,
|
||||
metadata={
|
||||
"observation_count": len(indices),
|
||||
"_prototype_vector": prototype.tolist(),
|
||||
},
|
||||
)
|
||||
|
||||
nodes.append(node)
|
||||
@@ -522,27 +551,172 @@ class GraphBuilder:
|
||||
) -> ScreenTemplate:
|
||||
"""
|
||||
Créer un ScreenTemplate depuis un cluster d'états.
|
||||
|
||||
TODO: Implémenter extraction intelligente de:
|
||||
- window_title_pattern (regex depuis titres communs)
|
||||
- required_text_patterns (texte présent dans tous les états)
|
||||
- required_ui_elements (éléments UI communs)
|
||||
|
||||
|
||||
Extrait les contraintes communes à tous les états du cluster :
|
||||
- window_title_pattern : titre de fenêtre commun
|
||||
- required_text_patterns : textes présents dans la majorité des états
|
||||
- required_ui_elements : rôles/types UI récurrents
|
||||
|
||||
Args:
|
||||
states: États du cluster
|
||||
prototype_embedding: Embedding prototype
|
||||
|
||||
|
||||
Returns:
|
||||
ScreenTemplate avec contraintes
|
||||
ScreenTemplate avec contraintes extraites
|
||||
"""
|
||||
# Pour l'instant, template basique avec seulement l'embedding
|
||||
return ScreenTemplate(
|
||||
embedding_prototype=prototype_embedding.tolist(),
|
||||
similarity_threshold=0.85,
|
||||
window_title_pattern=None, # TODO: Extraire
|
||||
required_text_patterns=[], # TODO: Extraire
|
||||
required_ui_elements=[], # TODO: Extraire
|
||||
# --- Extraction du titre de fenêtre commun ---
|
||||
window_title_pattern = self._extract_window_pattern(states)
|
||||
|
||||
# --- Extraction des textes récurrents ---
|
||||
required_text_patterns = self._extract_common_texts(states)
|
||||
|
||||
# --- Extraction des éléments UI récurrents ---
|
||||
required_ui_elements = self._extract_common_ui_elements(states)
|
||||
|
||||
# Construire les sous-objets de contraintes
|
||||
window_constraint = WindowConstraint(
|
||||
title_pattern=window_title_pattern,
|
||||
title_contains=window_title_pattern,
|
||||
)
|
||||
|
||||
text_constraint = TextConstraint(
|
||||
required_texts=required_text_patterns,
|
||||
)
|
||||
|
||||
ui_roles = [
|
||||
e.get("role", "") for e in required_ui_elements if e.get("role")
|
||||
]
|
||||
ui_constraint = UIConstraint(
|
||||
required_roles=ui_roles,
|
||||
)
|
||||
|
||||
embedding_proto = EmbeddingPrototype(
|
||||
provider="openclip_ViT-B-32",
|
||||
vector_id="", # Le vecteur est stocké dans node.metadata._prototype_vector
|
||||
min_cosine_similarity=0.85,
|
||||
sample_count=len(states),
|
||||
)
|
||||
|
||||
return ScreenTemplate(
|
||||
window=window_constraint,
|
||||
text=text_constraint,
|
||||
ui=ui_constraint,
|
||||
embedding=embedding_proto,
|
||||
)
|
||||
|
||||
def _extract_window_pattern(self, states: List["ScreenState"]) -> Optional[str]:
    """Derive a window-title pattern shared by the states of a cluster.

    Resolution order:
      1. No state carries a window title -> ``None``.
      2. Every title is identical -> that title.
      3. The longest common prefix of all titles, with trailing separator
         characters removed, when at least 5 characters remain.
      4. Otherwise, the most frequent title (first seen wins on ties).

    Args:
        states: States belonging to the cluster.

    Returns:
        The extracted title pattern, or ``None`` when no title is available.
    """
    # Local import keeps this method independent of the module-level imports.
    from collections import Counter

    # States without window information are skipped instead of raising.
    windows = (getattr(state, "window", None) for state in states)
    titles = [w.window_title for w in windows if w and w.window_title]
    if not titles:
        return None

    if len(set(titles)) == 1:
        return titles[0]

    # commonprefix compares character by character, so the prefix may end on
    # a separator ("Report - "). Strip separators BEFORE the length check:
    # otherwise a prefix such as "ab - |" passes the check and yields the
    # overly generic pattern "ab".
    prefix = os.path.commonprefix(titles).rstrip(" -–—|")
    if len(prefix) >= 5:
        return prefix

    # Fallback: the most frequent title.
    return Counter(titles).most_common(1)[0][0]
|
||||
|
||||
def _extract_common_texts(
    self, states: List["ScreenState"], min_presence_ratio: float = 0.6
) -> List[str]:
    """Extract the texts that appear in most states of a cluster.

    Texts are compared after stripping whitespace and lower-casing, and a
    text is counted at most once per state. Normalized texts shorter than
    3 characters are ignored.

    Args:
        states: States belonging to the cluster.
        min_presence_ratio: Minimum share of the states that have detected
            text in which a text must appear (0.6 = 60%).

    Returns:
        Up to 10 normalized texts, most frequent first. A text must also be
        seen in at least 2 states, so a cluster with fewer than 2 states
        carrying text always yields an empty list.
    """
    if not states:
        return []

    text_counts: Dict[str, int] = defaultdict(int)
    states_with_text = 0

    for state in states:
        detected = getattr(state.perception, "detected_text", None)
        if not detected:
            continue
        states_with_text += 1
        # dict.fromkeys de-duplicates within the state while keeping the
        # first-seen order, which keeps tie-breaking deterministic.
        for normalized in dict.fromkeys(t.strip().lower() for t in detected):
            if len(normalized) >= 3:
                text_counts[normalized] += 1

    if states_with_text == 0:
        return []

    # Compare against the un-truncated quota: int() truncation would let a
    # text seen in 2 of 4 states (50%) satisfy a 60% ratio. The epsilon
    # absorbs float rounding in the product.
    quota = max(2.0, states_with_text * min_presence_ratio - 1e-9)
    common_texts = [
        text for text, count in text_counts.items() if count >= quota
    ]

    # Stable sort: equally frequent texts keep their first-seen order.
    common_texts.sort(key=text_counts.__getitem__, reverse=True)
    return common_texts[:10]
|
||||
|
||||
def _extract_common_ui_elements(
    self, states: List["ScreenState"], min_presence_ratio: float = 0.5
) -> List[Dict[str, Any]]:
    """Extract the UI element roles that recur across a cluster.

    A role is counted at most once per state. Elements without a ``role``
    attribute, or whose role is empty or ``'unknown'``, are ignored.

    Args:
        states: States belonging to the cluster.
        min_presence_ratio: Minimum share of the states that have UI
            elements in which a role must appear (0.5 = 50%).

    Returns:
        Up to 8 constraints, most frequent role first, in the form
        ``[{"role": "validate", "min_count": 1}, ...]``. A role must also
        be seen in at least 2 states.
    """
    if not states:
        return []

    role_counts: Dict[str, int] = defaultdict(int)
    states_with_ui = 0

    for state in states:
        if not state.ui_elements:
            continue
        states_with_ui += 1
        # dict.fromkeys de-duplicates within the state while keeping the
        # first-seen order, which keeps tie-breaking deterministic.
        roles_in_state = dict.fromkeys(
            getattr(el, "role", "unknown") for el in state.ui_elements
        )
        for role in roles_in_state:
            # Empty/None roles carry no constraint; skipping them here
            # prevents them from taking one of the 8 result slots.
            if role and role != "unknown":
                role_counts[role] += 1

    if states_with_ui == 0:
        return []

    # Compare against the un-truncated quota: int() truncation would let a
    # role seen in 2 of 5 states (40%) satisfy a 50% ratio. The epsilon
    # absorbs float rounding in the product.
    quota = max(2.0, states_with_ui * min_presence_ratio - 1e-9)
    recurring_roles = [
        role for role, count in role_counts.items() if count >= quota
    ]

    # Stable sort: equally frequent roles keep their first-seen order.
    recurring_roles.sort(key=role_counts.__getitem__, reverse=True)
    return [{"role": role, "min_count": 1} for role in recurring_roles[:8]]
|
||||
|
||||
def _build_edges(
|
||||
self,
|
||||
@@ -633,9 +807,14 @@ class GraphBuilder:
|
||||
# Récupérer les embeddings des prototypes de nodes
|
||||
node_prototypes = {}
|
||||
for node in nodes:
|
||||
if hasattr(node, 'template') and node.template:
|
||||
if hasattr(node.template, 'embedding_prototype'):
|
||||
node_prototypes[node.node_id] = np.array(node.template.embedding_prototype)
|
||||
# Priorité : vecteur en mémoire (metadata), sinon chargement depuis disque
|
||||
proto_list = node.metadata.get("_prototype_vector")
|
||||
if proto_list is not None:
|
||||
node_prototypes[node.node_id] = np.array(proto_list, dtype=np.float32)
|
||||
elif node.template and node.template.embedding and node.template.embedding.vector_id:
|
||||
proto_path = Path(node.template.embedding.vector_id)
|
||||
if proto_path.exists():
|
||||
node_prototypes[node.node_id] = np.load(proto_path)
|
||||
|
||||
if not node_prototypes:
|
||||
logger.warning("No node prototypes available for mapping")
|
||||
@@ -741,7 +920,7 @@ class GraphBuilder:
|
||||
action = Action(
|
||||
type=action_type,
|
||||
target=TargetSpec(
|
||||
role=target_role,
|
||||
by_role=target_role,
|
||||
selection_policy="first",
|
||||
fallback_strategy="visual_similarity"
|
||||
),
|
||||
|
||||
@@ -133,10 +133,10 @@ class NodeMatcher:
|
||||
node: WorkflowNode
|
||||
) -> bool:
|
||||
"""Valider les contraintes du node contre l'état."""
|
||||
template = node.screen_template
|
||||
|
||||
if template.window_title_pattern:
|
||||
if not state.raw_level or not state.raw_level.window_title:
|
||||
template = node.template
|
||||
|
||||
if template and template.window and template.window.title_pattern:
|
||||
if not state.window or not state.window.window_title:
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -179,13 +179,14 @@ class NodeMatcher:
|
||||
# Calculer similarités avec tous les nodes
|
||||
similarities = []
|
||||
for node in candidate_nodes:
|
||||
if node.screen_template.embedding_prototype_path:
|
||||
proto_path = node.template.embedding.vector_id if (node.template and node.template.embedding) else None
|
||||
if proto_path:
|
||||
try:
|
||||
prototype = np.load(node.screen_template.embedding_prototype_path)
|
||||
prototype = np.load(proto_path)
|
||||
similarity = float(np.dot(state_vector, prototype))
|
||||
similarities.append({
|
||||
'node_id': node.node_id,
|
||||
'node_label': node.label,
|
||||
'node_label': node.name,
|
||||
'similarity': similarity,
|
||||
'threshold': self.similarity_threshold,
|
||||
'matched': similarity >= self.similarity_threshold
|
||||
@@ -204,9 +205,9 @@ class NodeMatcher:
|
||||
'timestamp': timestamp,
|
||||
'failed_match_id': failed_match_id,
|
||||
'state': {
|
||||
'window_title': state.raw_level.window_title if state.raw_level else None,
|
||||
'screenshot_path': str(state.raw_level.screenshot_path) if state.raw_level else None,
|
||||
'ui_elements_count': len(state.perception_level.ui_elements) if state.perception_level else 0
|
||||
'window_title': state.window.window_title if getattr(state, 'window', None) else None,
|
||||
'screenshot_path': str(state.raw.screenshot_path) if getattr(state, 'raw', None) else None,
|
||||
'ui_elements_count': len(state.ui_elements) if getattr(state, 'ui_elements', None) else 0
|
||||
},
|
||||
'matching_results': {
|
||||
'best_confidence': best_confidence,
|
||||
|
||||
Reference in New Issue
Block a user