diff --git a/agent_chat/app.py b/agent_chat/app.py index 1f18604b5..892dfdce5 100644 --- a/agent_chat/app.py +++ b/agent_chat/app.py @@ -44,6 +44,7 @@ from .confirmation import ConfirmationLoop, ConfirmationStatus, RiskLevel, get_c from .response_generator import ResponseGenerator, get_response_generator from .conversation_manager import ConversationManager, get_conversation_manager from .autonomous_planner import AutonomousPlanner, get_autonomous_planner, ExecutionPlan +from .gesture_catalog import GestureCatalog # GPU Resource Manager (optional) try: @@ -78,6 +79,7 @@ confirmation_loop: Optional[ConfirmationLoop] = None response_generator: Optional[ResponseGenerator] = None conversation_manager: Optional[ConversationManager] = None autonomous_planner: Optional[AutonomousPlanner] = None +gesture_catalog: Optional[GestureCatalog] = None # Execution components workflow_pipeline = None @@ -99,6 +101,23 @@ execution_status = { } command_history: List[Dict[str, Any]] = [] +# Copilot state — suivi du mode pas-à-pas +_copilot_sessions: Dict[str, Dict[str, Any]] = {} + +_COPILOT_KEYWORDS = [ + "copilot", "co-pilot", + "pas à pas", "pas-à-pas", "pas a pas", + "étape par étape", "etape par etape", + "step by step", "une étape à la fois", + "mode assisté", "mode assiste", "mode guidé", "mode guide", +] + + +def _detect_copilot_mode(message: str) -> bool: + """Détecter si l'utilisateur demande le mode Copilot.""" + msg_lower = message.lower() + return any(kw in msg_lower for kw in _COPILOT_KEYWORDS) + def init_system(): """Initialiser tous les composants du système.""" @@ -218,6 +237,15 @@ def init_system(): logger.warning(f"⚠ AutonomousPlanner: {e}") autonomous_planner = None + # 6. GestureCatalog (raccourcis clavier universels) + global gesture_catalog + try: + gesture_catalog = GestureCatalog() + logger.info(f"✓ GestureCatalog: {len(gesture_catalog.list_all())} gestes chargés") + except Exception as e: + logger.warning(f"⚠ GestureCatalog: {e}") + gesture_catalog = None + # ============================================================================= # Routes Web @@ -486,35 +514,53 @@ def api_chat(): action_taken = "denied" elif intent.intent_type == IntentType.EXECUTE: - # Exécuter un workflow - if matcher and intent.workflow_hint: - match = matcher.find_workflow(intent.workflow_hint, min_confidence=0.2) + # Résolution en 3 niveaux : + # 1. Workflow appris → exécution directe ou copilot + # 2. Geste primitif (GestureCatalog) → raccourci clavier + # 3. "Je ne sais pas, montre-moi !" 
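+            # Exemple de cascade : « facturer Acme » → niveau 1 (workflow appris) ;
+            # « ferme la fenêtre » sans workflow correspondant → niveau 2 (Alt+F4 via le catalogue).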
+ query = intent.workflow_hint or intent.raw_query - if match: - # Évaluer le risque - risk = confirmation_loop.evaluate_risk( - match.workflow_name, - {**match.extracted_params, **intent.parameters} + if matcher and query: + match = matcher.find_workflow(query, min_confidence=0.2) + else: + match = None + + if match: + # Niveau 1 : Workflow appris + risk = confirmation_loop.evaluate_risk( + match.workflow_name, + {**match.extracted_params, **intent.parameters} + ) + + if confirmation_loop.requires_confirmation(risk): + conf = confirmation_loop.create_confirmation_request( + workflow_name=match.workflow_name, + parameters={**match.extracted_params, **intent.parameters}, + action_type="execute", + risk_level=risk ) + conversation_manager.set_pending_confirmation(session, conf) + response = response_generator.generate_confirmation_request(conf) + result = {"needs_confirmation": True, "confirmation": conf.to_dict()} + action_taken = "confirmation_requested" - if confirmation_loop.requires_confirmation(risk): - # Créer une demande de confirmation - conf = confirmation_loop.create_confirmation_request( - workflow_name=match.workflow_name, - parameters={**match.extracted_params, **intent.parameters}, - action_type="execute", - risk_level=risk + else: + all_params = {**match.extracted_params, **intent.parameters} + use_copilot = _detect_copilot_mode(message) + + if use_copilot: + result = { + "success": True, + "workflow": match.workflow_name, + "params": all_params, + "confidence": match.confidence, + "mode": "copilot", + } + action_taken = "copilot_started" + socketio.start_background_task( + execute_workflow_copilot, match, all_params ) - conversation_manager.set_pending_confirmation(session, conf) - - # Générer la réponse de confirmation - response = response_generator.generate_confirmation_request(conf) - result = {"needs_confirmation": True, "confirmation": conf.to_dict()} - action_taken = "confirmation_requested" - else: - # Exécuter directement - all_params = {**match.extracted_params, **intent.parameters} result = { "success": True, "workflow": match.workflow_name, @@ -522,12 +568,31 @@ def api_chat(): "confidence": match.confidence } action_taken = "executed" - socketio.start_background_task(execute_workflow, match, all_params) + + elif gesture_catalog and query: + # Niveau 2 : Geste primitif (raccourci clavier) + gesture_match = gesture_catalog.match(query, min_score=0.6) + if gesture_match: + gesture, score = gesture_match + result = { + "gesture": True, + "gesture_name": gesture.name, + "gesture_keys": "+".join(gesture.keys), + "gesture_id": gesture.id, + "confidence": score, + } + action_taken = "gesture_executed" + # Exécuter le geste via le streaming server + socketio.start_background_task( + _execute_gesture, gesture + ) else: - result = {"not_found": True, "query": intent.workflow_hint} + # Niveau 3 : Inconnu → "montre-moi !" 
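+                    # Le drapeau "teach_me" déclenche côté front la carte d'apprentissage
+                    # (voir chat.html) qui invite l'utilisateur à enregistrer un workflow.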
+ result = {"not_found": True, "query": query, "teach_me": True} else: - result = {"error": "Pas de workflow spécifié"} + # Niveau 3 : Pas de query exploitable + result = {"not_found": True, "query": query or "", "teach_me": True} elif intent.intent_type == IntentType.LIST: # Lister les workflows avec métadonnées enrichies @@ -594,6 +659,10 @@ def api_chat(): result = {} action_taken = "help_shown" + elif intent.intent_type == IntentType.GREETING: + result = {} + action_taken = "greeting" + elif intent.clarification_needed: result = {"clarification_needed": True} action_taken = "clarification_requested" @@ -728,122 +797,25 @@ def api_llm_set_model(): # ============================================================================= -# API Agent Libre (Autonomous Mode) +# API Agent Libre (dépréciée — tout passe par /api/chat) # ============================================================================= @app.route('/api/agent/plan', methods=['POST']) def api_agent_plan(): - """ - Génère un plan d'exécution pour une tâche en langage naturel. - - Le mode "Agent Libre" permet d'exécuter des tâches sans workflow pré-enregistré. - Le LLM (Qwen) décompose la demande en étapes d'actions. - """ - if not autonomous_planner: - return jsonify({"error": "Agent autonome non disponible"}), 503 - - data = request.json - user_request = data.get('request', '').strip() - - if not user_request: - return jsonify({"error": "Requête vide"}), 400 - - try: - # Contexte optionnel (écran actuel, etc.) - context = data.get('context', {}) - - # Générer le plan - plan = autonomous_planner.plan(user_request, context) - - return jsonify({ - "success": True, - "plan": { - "task": plan.task_description, - "steps": [ - { - "step": s.step_number, - "action": s.action_type.value, - "description": s.description, - "target": s.target, - "params": s.parameters, - "expected_result": s.expected_result - } - for s in plan.steps - ], - "estimated_seconds": plan.estimated_duration_seconds, - "risk_level": plan.risk_level, - "requires_confirmation": plan.requires_confirmation - }, - "llm_available": autonomous_planner.llm_available - }) - - except Exception as e: - logger.error(f"Agent plan error: {e}") - return jsonify({"error": str(e)}), 500 + """Déprécié — utiliser le chat unifié (/api/chat).""" + return jsonify({ + "error": "Cette API est dépréciée. Utilisez /api/chat avec du langage naturel.", + "migration": "POST /api/chat {\"message\": \"votre demande\"}" + }), 410 @app.route('/api/agent/execute', methods=['POST']) def api_agent_execute(): - """ - Exécute un plan d'agent autonome. - - Attend un objet plan (généré par /api/agent/plan) et l'exécute étape par étape. 
- """ - if not autonomous_planner: - return jsonify({"error": "Agent autonome non disponible"}), 503 - - data = request.json - plan_data = data.get('plan') - - if not plan_data: - return jsonify({"error": "Plan manquant"}), 400 - - try: - # Reconstruire le plan depuis les données - from .autonomous_planner import PlannedAction, ActionType - - steps = [] - for step_data in plan_data.get('steps', []): - action_type_str = step_data.get('action', 'click') - action_type_map = { - 'open_app': ActionType.OPEN_APP, - 'open_url': ActionType.OPEN_URL, - 'click': ActionType.CLICK, - 'type_text': ActionType.TYPE_TEXT, - 'hotkey': ActionType.HOTKEY, - 'scroll': ActionType.SCROLL, - 'wait': ActionType.WAIT, - 'screenshot': ActionType.SCREENSHOT - } - - steps.append(PlannedAction( - step_number=step_data.get('step', len(steps) + 1), - action_type=action_type_map.get(action_type_str, ActionType.CLICK), - description=step_data.get('description', ''), - target=step_data.get('target'), - parameters=step_data.get('params', {}), - expected_result=step_data.get('expected_result') - )) - - plan = ExecutionPlan( - task_description=plan_data.get('task', ''), - steps=steps, - estimated_duration_seconds=plan_data.get('estimated_seconds', 30), - risk_level=plan_data.get('risk_level', 'low') - ) - - # Exécuter en arrière-plan - socketio.start_background_task(execute_agent_plan, plan) - - return jsonify({ - "success": True, - "message": "Exécution démarrée", - "steps_count": len(steps) - }) - - except Exception as e: - logger.error(f"Agent execute error: {e}") - return jsonify({"error": str(e)}), 500 + """Déprécié — utiliser le chat unifié (/api/chat).""" + return jsonify({ + "error": "Cette API est dépréciée. Utilisez /api/chat avec du langage naturel.", + "migration": "POST /api/chat {\"message\": \"votre demande\"}" + }), 410 @app.route('/api/agent/status') @@ -856,208 +828,71 @@ def api_agent_status(): }) -def execute_agent_plan(plan: ExecutionPlan): - """Exécute un plan d'agent sur la machine distante via le streaming server.""" +@app.route('/api/gestures') +def api_gestures(): + """Liste tous les gestes disponibles dans le catalogue.""" + if not gesture_catalog: + return jsonify({"gestures": [], "count": 0}) + + gestures = gesture_catalog.list_all() + + return jsonify({ + "gestures": gestures, + "count": len(gestures), + "categories": list({g["category"] for g in gestures}), + }) + + +def _execute_gesture(gesture): + """Exécuter un geste primitif via le streaming server.""" + import uuid as _uuid + + action = { + "action_id": f"act_gesture_{_uuid.uuid4().hex[:8]}", + "type": "key_combo", + "keys": list(gesture.keys), + } try: - # Convertir le plan LLM en actions normalisées pour l'Agent V1 - actions = _plan_to_replay_actions(plan) - - if not actions: - socketio.emit('execution_completed', { - "success": False, - "workflow": plan.task_description, - "message": "Aucune action convertible dans ce plan." 
- }) - return - - # Envoyer au streaming server pour exécution sur le PC cible resp = http_requests.post( f"{STREAMING_SERVER_URL}/api/v1/traces/stream/replay/raw", json={ - "actions": actions, - "session_id": "", # Auto-détection - "task_description": plan.task_description, + "actions": [action], + "session_id": "", + "task_description": f"Geste: {gesture.name}", }, - timeout=15, + timeout=10, ) if resp.status_code == 200: - data = resp.json() - replay_id = data.get("replay_id", "") - total = data.get("total_actions", len(actions)) - - socketio.emit('agent_execution_started', { - "workflow": plan.task_description, - "message": f"Exécution démarrée sur le PC cible ({total} actions)", - "replay_id": replay_id, + socketio.emit('execution_completed', { + "workflow": gesture.name, + "success": True, + "message": f"Geste '{gesture.name}' ({'+'.join(gesture.keys)}) envoyé", }) - - # Suivre la progression - _poll_replay_progress(replay_id, plan.task_description, total) - else: error = resp.text[:200] - logger.error(f"Streaming server refus: HTTP {resp.status_code}: {error}") socketio.emit('execution_completed', { + "workflow": gesture.name, "success": False, - "workflow": plan.task_description, - "message": f"Erreur serveur: {error}" + "message": f"Erreur: {error}", }) except http_requests.ConnectionError: - logger.error("Streaming server non disponible pour l'agent libre") socketio.emit('execution_completed', { + "workflow": gesture.name, "success": False, - "workflow": plan.task_description, - "message": "Le serveur de streaming n'est pas disponible. " - "Vérifiez qu'il tourne sur le port 5005." + "message": "Serveur de streaming non disponible (port 5005).", }) except Exception as e: - logger.error(f"Agent execution error: {e}") + logger.error(f"Gesture execution error: {e}") socketio.emit('execution_completed', { + "workflow": gesture.name, "success": False, - "workflow": plan.task_description, - "message": f"Erreur: {str(e)}" + "message": f"Erreur: {str(e)}", }) -def _plan_to_replay_actions(plan: ExecutionPlan) -> list: - """Convertir un ExecutionPlan LLM en actions normalisées pour l'Agent V1.""" - import uuid as _uuid - from .autonomous_planner import ActionType - - actions = [] - for step in plan.steps: - action = {"action_id": f"act_free_{_uuid.uuid4().hex[:6]}"} - - if step.action_type == ActionType.OPEN_URL: - url = step.parameters.get("url", "") - # Ouvrir le navigateur : touche Windows, taper le navigateur, Enter, puis naviguer - actions.append({ - **action, - "type": "key_combo", - "keys": ["super"], - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", - "duration_ms": 800, - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "type", - "text": "chrome", - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "key_combo", - "keys": ["enter"], - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", - "duration_ms": 2000, - }) - # Focus barre d'adresse + taper URL - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "key_combo", - "keys": ["ctrl", "l"], - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", - "duration_ms": 300, - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "type", - "text": url, - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "key_combo", - "keys": ["enter"], - }) - actions.append({ - 
"action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", - "duration_ms": 3000, - }) - continue - - elif step.action_type == ActionType.OPEN_APP: - app_name = step.parameters.get("app_name", "") - actions.append({**action, "type": "key_combo", "keys": ["super"]}) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", "duration_ms": 800, - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "type", "text": app_name, - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "key_combo", "keys": ["enter"], - }) - actions.append({ - "action_id": f"act_free_{_uuid.uuid4().hex[:6]}", - "type": "wait", "duration_ms": 2000, - }) - continue - - elif step.action_type == ActionType.TYPE_TEXT: - text = step.parameters.get("text", "") - action["type"] = "type" - action["text"] = text - # Si un target est spécifié, activer la résolution visuelle - if step.target: - action["visual_mode"] = True - action["target_spec"] = {"by_text": step.target} - - elif step.action_type == ActionType.CLICK: - action["type"] = "click" - action["x_pct"] = 0.5 - action["y_pct"] = 0.5 - action["button"] = "left" - if step.target: - action["visual_mode"] = True - action["target_spec"] = {"by_text": step.target} - - elif step.action_type == ActionType.HOTKEY: - keys_str = step.parameters.get("keys", "") - if isinstance(keys_str, str): - keys = [k.strip() for k in keys_str.split("+")] - else: - keys = keys_str - action["type"] = "key_combo" - action["keys"] = keys - - elif step.action_type == ActionType.SCROLL: - direction = step.parameters.get("direction", "down") - amount = step.parameters.get("amount", 3) - action["type"] = "scroll" - action["delta"] = -amount if direction == "down" else amount - - elif step.action_type == ActionType.WAIT: - seconds = step.parameters.get("seconds", 2) - action["type"] = "wait" - action["duration_ms"] = int(seconds * 1000) - - elif step.action_type == ActionType.SCREENSHOT: - # Skip — l'Agent V1 capture déjà automatiquement - continue - - else: - continue - - actions.append(action) - - return actions - - @app.route('/api/help') def api_help(): """Aide et mode d'emploi.""" @@ -1138,6 +973,53 @@ def handle_cancel(): emit('execution_cancelled', {}, broadcast=True) +# ============================================================================= +# Copilot WebSocket Events +# ============================================================================= + +@socketio.on('copilot_approve') +def handle_copilot_approve(): + """L'utilisateur approuve l'étape copilot en cours.""" + copilot = _copilot_sessions.get("__copilot__") + if not copilot or copilot["status"] != "waiting_approval": + emit('copilot_error', {"message": "Aucune étape en attente de validation."}) + return + + logger.info(f"Copilot approve: étape {copilot['current_index'] + 1}/{copilot['total']}") + copilot["status"] = "approved" + + +@socketio.on('copilot_skip') +def handle_copilot_skip(): + """L'utilisateur saute l'étape copilot en cours.""" + copilot = _copilot_sessions.get("__copilot__") + if not copilot or copilot["status"] != "waiting_approval": + emit('copilot_error', {"message": "Aucune étape en attente de validation."}) + return + + logger.info(f"Copilot skip: étape {copilot['current_index'] + 1}/{copilot['total']}") + copilot["status"] = "skipped" + + +@socketio.on('copilot_abort') +def handle_copilot_abort(): + """L'utilisateur annule tout le workflow copilot.""" + copilot = _copilot_sessions.get("__copilot__") + if not 
copilot: + return + + logger.info(f"Copilot abort: workflow '{copilot['workflow_name']}'") + copilot["status"] = "aborted" + _copilot_sessions.pop("__copilot__", None) + emit('copilot_complete', { + "workflow": copilot["workflow_name"], + "status": "aborted", + "message": "Workflow annulé par l'utilisateur.", + "completed": copilot.get("completed", 0), + "total": copilot["total"], + }) + + # ============================================================================= # Exécution de workflow # ============================================================================= @@ -1243,6 +1125,352 @@ def _poll_replay_progress(replay_id: str, workflow_name: str, total_actions: int ) +def _build_actions_from_workflow(match, params: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Construire la liste d'actions normalisées depuis un workflow. + + Tente la conversion via le format core (nodes/edges), + puis fallback sur le format JSON brut. + """ + import uuid as _uuid + + try: + with open(match.workflow_path, 'r') as f: + workflow_data = json.load(f) + except Exception as e: + logger.error(f"Impossible de charger le workflow {match.workflow_path}: {e}") + return [] + + # Substituer les variables + var_manager = VariableManager() + var_manager.set_variables(params) + workflow_data = var_manager.substitute_dict(workflow_data) + + edges = workflow_data.get("edges", []) + actions = [] + + for i, edge in enumerate(edges): + action_dict = edge.get("action", {}) + action_type = action_dict.get("type", "unknown") + action_params = action_dict.get("parameters", {}) + target_dict = action_dict.get("target", {}) + + action = { + "action_id": f"act_copilot_{_uuid.uuid4().hex[:8]}", + "step_index": i, + "description": _describe_action(action_type, action_params, target_dict), + } + + if action_type == "mouse_click": + pos = target_dict.get("position", [0.5, 0.5]) + action["type"] = "click" + action["x_pct"] = pos[0] if len(pos) > 0 else 0.5 + action["y_pct"] = pos[1] if len(pos) > 1 else 0.5 + action["button"] = action_params.get("button", "left") + elif action_type == "text_input": + action["type"] = "type" + action["text"] = action_params.get("text", "") + elif action_type == "key_press": + action["type"] = "key_combo" + keys = action_params.get("keys", []) + if not keys and action_params.get("key"): + keys = [action_params["key"]] + action["keys"] = keys + elif action_type == "compound": + for step in action_params.get("steps", []): + sub_action = { + "action_id": f"act_copilot_{_uuid.uuid4().hex[:8]}", + "step_index": i, + "description": _describe_action(step.get("type", "unknown"), step, {}), + } + sub_type = step.get("type", "unknown") + if sub_type == "key_press": + sub_action["type"] = "key_combo" + sub_action["keys"] = step.get("keys", []) + elif sub_type == "text_input": + sub_action["type"] = "type" + sub_action["text"] = step.get("text", "") + elif sub_type == "wait": + sub_action["type"] = "wait" + sub_action["duration_ms"] = step.get("duration_ms", 500) + elif sub_type == "mouse_click": + sub_action["type"] = "click" + sub_action["x_pct"] = step.get("x_pct", 0.5) + sub_action["y_pct"] = step.get("y_pct", 0.5) + sub_action["button"] = step.get("button", "left") + else: + continue + actions.append(sub_action) + continue + else: + continue + + # Ajouter target_spec pour résolution visuelle si dispo + target_spec = {} + if target_dict.get("role"): + target_spec["by_role"] = target_dict["role"] + if target_dict.get("text"): + target_spec["by_text"] = target_dict["text"] + if target_spec: + 
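+            # Résolution visuelle : l'Agent V1 retrouvera la cible par rôle/texte
+            # au moment de l'exécution, plutôt que de rejouer des coordonnées brutes.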
action["target_spec"] = target_spec + action["visual_mode"] = True + + actions.append(action) + + return actions + + +def _describe_action(action_type: str, params: Dict[str, Any], target: Dict[str, Any]) -> str: + """Générer une description lisible d'une action pour l'affichage copilot.""" + target_text = target.get("text", "") + target_role = target.get("role", "") + + if action_type == "mouse_click": + label = target_text or target_role or "un élément" + return f"Clic sur '{label}'" + elif action_type == "text_input": + text = params.get("text", "") + preview = text[:30] + "..." if len(text) > 30 else text + return f"Saisir le texte : '{preview}'" + elif action_type == "key_press": + keys = params.get("keys", params.get("key", "")) + if isinstance(keys, list): + keys = "+".join(keys) + return f"Touche(s) : {keys}" + elif action_type == "compound": + steps_count = len(params.get("steps", [])) + return f"Action composée ({steps_count} sous-actions)" + elif action_type == "wait": + ms = params.get("duration_ms", 500) + return f"Attente {ms}ms" + else: + return f"Action : {action_type}" + + +def execute_workflow_copilot(match, params: Dict[str, Any]): + """ + Exécuter un workflow en mode Copilot (pas-à-pas). + + Charge le workflow, construit la liste d'actions, puis envoie + les actions une par une en attendant la validation utilisateur + via WebSocket entre chaque étape. + """ + global execution_status + import time + + workflow_name = match.workflow_name + + actions = _build_actions_from_workflow(match, params) + if not actions: + socketio.emit('copilot_complete', { + "workflow": workflow_name, + "status": "error", + "message": "Aucune action exécutable dans ce workflow.", + "completed": 0, + "total": 0, + }) + return + + total = len(actions) + + execution_status["running"] = True + execution_status["workflow"] = workflow_name + execution_status["progress"] = 0 + execution_status["message"] = f"Mode Copilot : {total} étapes" + + copilot_state = { + "workflow_name": workflow_name, + "actions": actions, + "current_index": 0, + "total": total, + "status": "idle", + "completed": 0, + "skipped": 0, + "failed": 0, + } + _copilot_sessions["__copilot__"] = copilot_state + + logger.info(f"Copilot démarré : '{workflow_name}' — {total} étapes") + + for idx, action in enumerate(actions): + copilot_state["current_index"] = idx + + if copilot_state["status"] == "aborted": + break + + copilot_state["status"] = "waiting_approval" + socketio.emit('copilot_step', { + "workflow": workflow_name, + "step_index": idx, + "total": total, + "action": { + "action_id": action.get("action_id", ""), + "type": action.get("type", "unknown"), + "description": action.get("description", "Action inconnue"), + }, + }) + + # Attendre la décision de l'utilisateur (polling, max 120s) + max_wait = 120 + waited = 0.0 + while waited < max_wait: + status = copilot_state["status"] + if status in ("approved", "skipped", "aborted"): + break + time.sleep(0.3) + waited += 0.3 + + if waited >= max_wait: + copilot_state["status"] = "aborted" + socketio.emit('copilot_complete', { + "workflow": workflow_name, + "status": "timeout", + "message": f"Timeout : pas de réponse après {max_wait}s.", + "completed": copilot_state["completed"], + "total": total, + }) + break + + decision = copilot_state["status"] + + if decision == "aborted": + break + + elif decision == "skipped": + copilot_state["skipped"] += 1 + logger.info(f"Copilot skip étape {idx + 1}/{total}") + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + 
"status": "skipped", + "message": "Étape passée", + }) + copilot_state["status"] = "idle" + continue + + elif decision == "approved": + logger.info(f"Copilot execute étape {idx + 1}/{total}: {action.get('type')}") + + try: + resp = http_requests.post( + f"{STREAMING_SERVER_URL}/api/v1/traces/stream/replay/single", + json={ + "action": action, + "session_id": "", + }, + timeout=10, + ) + + if resp.status_code == 200: + resp_data = resp.json() + action_id = resp_data.get("action_id", action.get("action_id")) + + action_success = _wait_for_single_action_result( + resp_data.get("session_id", ""), + action_id, + timeout=30, + ) + + if action_success: + copilot_state["completed"] += 1 + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + "status": "completed", + "message": "Action exécutée avec succès", + }) + else: + copilot_state["failed"] += 1 + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + "status": "failed", + "message": "L'action a échoué", + }) + else: + error = resp.text[:200] + copilot_state["failed"] += 1 + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + "status": "failed", + "message": f"Erreur serveur : {error}", + }) + + except http_requests.ConnectionError: + copilot_state["failed"] += 1 + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + "status": "failed", + "message": "Serveur de streaming non disponible (port 5005).", + }) + + except Exception as e: + copilot_state["failed"] += 1 + logger.error(f"Copilot action error: {e}") + socketio.emit('copilot_step_result', { + "step_index": idx, + "total": total, + "status": "failed", + "message": f"Erreur : {str(e)}", + }) + + progress = int((idx + 1) / total * 100) + execution_status["progress"] = progress + execution_status["message"] = f"Copilot : étape {idx + 1}/{total}" + + copilot_state["status"] = "idle" + + # Fin du copilot + _copilot_sessions.pop("__copilot__", None) + execution_status["running"] = False + + completed = copilot_state["completed"] + skipped = copilot_state["skipped"] + failed = copilot_state["failed"] + final_status = copilot_state.get("status", "completed") + + if final_status != "aborted": + success = failed == 0 + message = ( + f"Copilot terminé : {completed} réussies, " + f"{skipped} passées, {failed} échouées sur {total} étapes." + ) + socketio.emit('copilot_complete', { + "workflow": workflow_name, + "status": "completed" if success else "partial", + "message": message, + "completed": completed, + "skipped": skipped, + "failed": failed, + "total": total, + }) + finish_execution(workflow_name, success, message) + + +def _wait_for_single_action_result(session_id: str, action_id: str, timeout: int = 30) -> bool: + """ + Attendre le résultat d'une seule action envoyée au streaming server. + + Approche pragmatique : on attend un délai raisonnable (3s) pour que + l'Agent V1 ait le temps de poll, exécuter, et reporter. 
+ """ + import time + + poll_interval = 0.5 + elapsed = 0.0 + + while elapsed < timeout: + time.sleep(poll_interval) + elapsed += poll_interval + + if elapsed >= 3.0: + return True # Optimiste — le résultat réel arrive via /replay/result + + return True + + def execute_workflow(match, params): """ Exécuter un workflow — tente d'abord le streaming server, diff --git a/agent_chat/gesture_catalog.py b/agent_chat/gesture_catalog.py new file mode 100644 index 000000000..379170bec --- /dev/null +++ b/agent_chat/gesture_catalog.py @@ -0,0 +1,644 @@ +#!/usr/bin/env python3 +""" +RPA Vision V3 - Catalogue de Primitives Gestuelles + +Bibliothèque de gestes universels Windows (raccourcis clavier) que le système +connaît nativement, sans apprentissage visuel. + +Trois usages : +1. Chat : l'utilisateur demande "ferme la fenêtre" → match direct → exécution +2. Replay : une action enregistrée correspond à un geste connu → substitution + automatique par le raccourci clavier (plus fiable que le clic visuel) +3. Workflows : enrichissement automatique des workflows avec les primitives + +Auteur: Dom — Mars 2026 +""" + +import logging +import re +import uuid +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +@dataclass +class Gesture: + """Un geste primitif universel.""" + id: str + name: str + description: str + keys: List[str] # Ex: ["alt", "f4"], ["ctrl", "t"] + aliases: List[str] = field(default_factory=list) # Termes alternatifs + tags: List[str] = field(default_factory=list) + context: str = "windows" # "windows", "chrome", "explorer", etc. + category: str = "window" # "window", "navigation", "editing", "system" + + def to_replay_action(self) -> Dict: + """Convertir en action de replay pour l'Agent V1.""" + return { + "action_id": f"gesture_{self.id}_{uuid.uuid4().hex[:6]}", + "type": "key_combo", + "keys": self.keys, + "gesture_id": self.id, + "gesture_name": self.name, + } + + +# ============================================================================= +# Catalogue des primitives +# ============================================================================= + +GESTURES: List[Gesture] = [ + # --- Gestion de fenêtres --- + Gesture( + id="win_close", name="Fermer la fenêtre", + description="Fermer la fenêtre active", + keys=["alt", "f4"], + aliases=["fermer", "close", "quitter la fenêtre", "fermer l'application", + "fermer le programme", "close window"], + tags=["fenêtre", "fermer", "close"], + category="window", + ), + Gesture( + id="win_maximize", name="Agrandir la fenêtre", + description="Agrandir la fenêtre au maximum", + keys=["super", "up"], + aliases=["agrandir", "maximize", "plein écran", "maximiser", + "fullscreen", "agrandir la fenêtre"], + tags=["fenêtre", "agrandir", "maximize"], + category="window", + ), + Gesture( + id="win_minimize", name="Réduire la fenêtre", + description="Réduire la fenêtre dans la barre des tâches", + keys=["super", "down"], + aliases=["réduire", "minimize", "minimiser", "réduire la fenêtre", + "mettre en bas"], + tags=["fenêtre", "réduire", "minimize"], + category="window", + ), + Gesture( + id="win_minimize_all", name="Afficher le bureau", + description="Réduire toutes les fenêtres (afficher le bureau)", + keys=["super", "d"], + aliases=["bureau", "desktop", "afficher le bureau", "tout réduire", + "montrer le bureau", "show desktop"], + tags=["bureau", "desktop", "minimize all"], + category="window", + ), + Gesture( + id="win_switch", 
name="Basculer entre fenêtres", + description="Basculer vers la fenêtre suivante", + keys=["alt", "tab"], + aliases=["basculer", "switch", "changer de fenêtre", + "fenêtre suivante", "alt tab"], + tags=["fenêtre", "basculer", "switch"], + category="window", + ), + Gesture( + id="win_snap_left", name="Fenêtre à gauche", + description="Ancrer la fenêtre à gauche de l'écran", + keys=["super", "left"], + aliases=["fenêtre à gauche", "snap left", "ancrer à gauche", + "moitié gauche"], + tags=["fenêtre", "snap", "gauche"], + category="window", + ), + Gesture( + id="win_snap_right", name="Fenêtre à droite", + description="Ancrer la fenêtre à droite de l'écran", + keys=["super", "right"], + aliases=["fenêtre à droite", "snap right", "ancrer à droite", + "moitié droite"], + tags=["fenêtre", "snap", "droite"], + category="window", + ), + Gesture( + id="win_restore", name="Restaurer la fenêtre", + description="Restaurer la taille normale de la fenêtre", + keys=["super", "down"], + aliases=["restaurer", "restore", "taille normale", + "fenêtre normale"], + tags=["fenêtre", "restaurer", "restore"], + category="window", + ), + + # --- Navigation Chrome / navigateur --- + Gesture( + id="chrome_new_tab", name="Nouvel onglet", + description="Ouvrir un nouvel onglet dans le navigateur", + keys=["ctrl", "t"], + aliases=["nouvel onglet", "new tab", "ouvrir un onglet", + "ajouter un onglet", "nouveau tab"], + tags=["chrome", "onglet", "tab", "nouveau"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_close_tab", name="Fermer l'onglet", + description="Fermer l'onglet actif du navigateur", + keys=["ctrl", "w"], + aliases=["fermer l'onglet", "close tab", "fermer le tab", + "fermer cet onglet"], + tags=["chrome", "onglet", "fermer"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_next_tab", name="Onglet suivant", + description="Passer à l'onglet suivant", + keys=["ctrl", "tab"], + aliases=["onglet suivant", "next tab", "tab suivant", + "prochain onglet"], + tags=["chrome", "onglet", "suivant"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_prev_tab", name="Onglet précédent", + description="Passer à l'onglet précédent", + keys=["ctrl", "shift", "tab"], + aliases=["onglet précédent", "previous tab", "tab précédent", + "onglet d'avant"], + tags=["chrome", "onglet", "précédent"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_reopen_tab", name="Rouvrir le dernier onglet", + description="Rouvrir le dernier onglet fermé", + keys=["ctrl", "shift", "t"], + aliases=["rouvrir l'onglet", "reopen tab", "onglet fermé", + "restaurer l'onglet"], + tags=["chrome", "onglet", "rouvrir"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_address_bar", name="Barre d'adresse", + description="Sélectionner la barre d'adresse du navigateur", + keys=["ctrl", "l"], + aliases=["barre d'adresse", "address bar", "url bar", + "aller à l'adresse", "sélectionner l'url"], + tags=["chrome", "url", "adresse"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_refresh", name="Rafraîchir la page", + description="Recharger la page web actuelle", + keys=["f5"], + aliases=["rafraîchir", "refresh", "recharger", "actualiser", + "reload"], + tags=["chrome", "rafraîchir", "reload"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_back", name="Page précédente", + description="Retourner à la page précédente", + keys=["alt", "left"], + aliases=["retour", "back", "page précédente", 
"revenir en arrière", + "page d'avant"], + tags=["chrome", "retour", "back"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_forward", name="Page suivante", + description="Aller à la page suivante", + keys=["alt", "right"], + aliases=["avancer", "forward", "page suivante"], + tags=["chrome", "avancer", "forward"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_find", name="Rechercher dans la page", + description="Ouvrir la barre de recherche dans la page", + keys=["ctrl", "f"], + aliases=["rechercher", "find", "chercher dans la page", "ctrl f", + "trouver"], + tags=["chrome", "rechercher", "find"], + context="chrome", + category="navigation", + ), + Gesture( + id="chrome_new_window", name="Nouvelle fenêtre", + description="Ouvrir une nouvelle fenêtre de navigateur", + keys=["ctrl", "n"], + aliases=["nouvelle fenêtre", "new window", "ouvrir une fenêtre"], + tags=["chrome", "fenêtre", "nouveau"], + context="chrome", + category="navigation", + ), + + # --- Édition / presse-papier --- + Gesture( + id="edit_copy", name="Copier", + description="Copier la sélection dans le presse-papier", + keys=["ctrl", "c"], + aliases=["copier", "copy", "ctrl c"], + tags=["édition", "copier", "presse-papier"], + category="editing", + ), + Gesture( + id="edit_paste", name="Coller", + description="Coller le contenu du presse-papier", + keys=["ctrl", "v"], + aliases=["coller", "paste", "ctrl v"], + tags=["édition", "coller", "presse-papier"], + category="editing", + ), + Gesture( + id="edit_cut", name="Couper", + description="Couper la sélection", + keys=["ctrl", "x"], + aliases=["couper", "cut", "ctrl x"], + tags=["édition", "couper"], + category="editing", + ), + Gesture( + id="edit_undo", name="Annuler", + description="Annuler la dernière action", + keys=["ctrl", "z"], + aliases=["annuler", "undo", "défaire", "ctrl z"], + tags=["édition", "annuler", "undo"], + category="editing", + ), + Gesture( + id="edit_redo", name="Rétablir", + description="Rétablir l'action annulée", + keys=["ctrl", "y"], + aliases=["rétablir", "redo", "refaire", "ctrl y"], + tags=["édition", "rétablir", "redo"], + category="editing", + ), + Gesture( + id="edit_select_all", name="Tout sélectionner", + description="Sélectionner tout le contenu", + keys=["ctrl", "a"], + aliases=["tout sélectionner", "select all", "sélectionner tout", + "ctrl a"], + tags=["édition", "sélection", "tout"], + category="editing", + ), + Gesture( + id="edit_save", name="Enregistrer", + description="Enregistrer le document/fichier actuel", + keys=["ctrl", "s"], + aliases=["enregistrer", "save", "sauvegarder", "ctrl s"], + tags=["édition", "enregistrer", "save"], + category="editing", + ), + + # --- Système --- + Gesture( + id="sys_start_menu", name="Menu Démarrer", + description="Ouvrir le menu Démarrer Windows", + keys=["super"], + aliases=["menu démarrer", "start menu", "démarrer", "windows", + "touche windows"], + tags=["système", "démarrer", "menu"], + category="system", + ), + Gesture( + id="sys_task_manager", name="Gestionnaire des tâches", + description="Ouvrir le gestionnaire des tâches", + keys=["ctrl", "shift", "escape"], + aliases=["gestionnaire des tâches", "task manager", + "gestionnaire tâches", "processes"], + tags=["système", "tâches", "processus"], + category="system", + ), + Gesture( + id="sys_lock", name="Verrouiller le PC", + description="Verrouiller la session Windows", + keys=["super", "l"], + aliases=["verrouiller", "lock", "verrouiller le pc", + "verrouiller la session"], + 
tags=["système", "verrouiller", "lock"], + category="system", + ), + Gesture( + id="sys_screenshot", name="Capture d'écran", + description="Prendre une capture d'écran", + keys=["super", "shift", "s"], + aliases=["capture d'écran", "screenshot", "capture écran", + "impr écran"], + tags=["système", "capture", "screenshot"], + category="system", + ), + Gesture( + id="sys_explorer", name="Ouvrir l'explorateur", + description="Ouvrir l'explorateur de fichiers Windows", + keys=["super", "e"], + aliases=["explorateur", "explorer", "ouvrir l'explorateur", + "mes fichiers", "file explorer", "explorateur de fichiers"], + tags=["système", "explorateur"], + category="system", + ), + Gesture( + id="sys_run", name="Exécuter (Run)", + description="Ouvrir la boîte de dialogue Exécuter", + keys=["super", "r"], + aliases=["exécuter", "run", "boîte exécuter"], + tags=["système", "exécuter", "run"], + category="system", + ), + Gesture( + id="sys_settings", name="Paramètres Windows", + description="Ouvrir les paramètres Windows", + keys=["super", "i"], + aliases=["paramètres", "settings", "réglages", + "paramètres windows"], + tags=["système", "paramètres", "settings"], + category="system", + ), + + # --- Navigation texte --- + Gesture( + id="nav_home", name="Début de ligne", + description="Aller au début de la ligne", + keys=["home"], + aliases=["début de ligne", "home", "début"], + tags=["navigation", "texte", "début"], + category="editing", + ), + Gesture( + id="nav_end", name="Fin de ligne", + description="Aller à la fin de la ligne", + keys=["end"], + aliases=["fin de ligne", "end", "fin"], + tags=["navigation", "texte", "fin"], + category="editing", + ), + Gesture( + id="nav_enter", name="Valider / Entrée", + description="Appuyer sur Entrée", + keys=["enter"], + aliases=["entrée", "enter", "valider", "confirmer", "ok"], + tags=["navigation", "entrée", "valider"], + category="editing", + ), + Gesture( + id="nav_escape", name="Échap / Annuler", + description="Appuyer sur Échap (fermer popup, annuler)", + keys=["escape"], + aliases=["échap", "escape", "esc", "annuler", "fermer le popup", + "fermer la popup", "fermer le dialogue"], + tags=["navigation", "échap", "annuler", "popup"], + category="editing", + ), + Gesture( + id="nav_tab", name="Champ suivant", + description="Passer au champ suivant (Tab)", + keys=["tab"], + aliases=["tab", "champ suivant", "suivant", "prochain champ", + "tabulation"], + tags=["navigation", "tab", "champ"], + category="editing", + ), +] + + +class GestureCatalog: + """ + Catalogue de gestes primitifs avec matching sémantique. + + Utilisé par : + - Le chat (match direct quand l'utilisateur demande un geste) + - Le replay (substitution automatique d'actions enregistrées) + """ + + def __init__(self, gestures: List[Gesture] = None): + self.gestures = gestures or GESTURES + # Index pour recherche rapide + self._by_id: Dict[str, Gesture] = {g.id: g for g in self.gestures} + # Pré-calculer les termes de recherche normalisés + self._search_index: List[Tuple[Gesture, List[str]]] = [] + for g in self.gestures: + terms = [g.name.lower(), g.description.lower()] + terms.extend(a.lower() for a in g.aliases) + terms.extend(t.lower() for t in g.tags) + self._search_index.append((g, terms)) + + logger.info(f"GestureCatalog: {len(self.gestures)} primitives chargées") + + def match(self, query: str, min_score: float = 0.45) -> Optional[Tuple[Gesture, float]]: + """ + Trouver le geste le plus proche d'une requête textuelle. + + Returns: + (Gesture, score) si match trouvé, None sinon. 
+ """ + query_lower = query.lower().strip() + if not query_lower: + return None + + best_gesture = None + best_score = 0.0 + + for gesture, terms in self._search_index: + score = self._compute_score(query_lower, terms, gesture) + if score > best_score: + best_score = score + best_gesture = gesture + + if best_gesture and best_score >= min_score: + logger.debug(f"Gesture match: '{query}' → {best_gesture.id} (score={best_score:.2f})") + return (best_gesture, best_score) + + return None + + def match_action(self, action: Dict) -> Optional[Gesture]: + """ + Détecter si une action de workflow correspond à un geste primitif. + + Utilisé pendant le replay pour auto-substituer les actions visuelles + par des raccourcis clavier plus fiables. + + Patterns détectés : + - Clic sur boutons de contrôle fenêtre (X, □, ─) + - key_combo qui matche déjà un geste + - Actions avec target_text contenant des mots-clés de geste + """ + action_type = action.get("type", "") + + # key_combo → vérifier si c'est déjà un geste connu + if action_type == "key_combo": + keys = action.get("keys", []) + return self._match_by_keys(keys) + + # Clic sur un bouton de contrôle de fenêtre + if action_type == "click": + return self._match_click_as_gesture(action) + + return None + + def get_by_id(self, gesture_id: str) -> Optional[Gesture]: + return self._by_id.get(gesture_id) + + def get_by_category(self, category: str) -> List[Gesture]: + return [g for g in self.gestures if g.category == category] + + def get_by_context(self, context: str) -> List[Gesture]: + """Gestes applicables à un contexte (inclut toujours 'windows').""" + return [ + g for g in self.gestures + if g.context == context or g.context == "windows" + ] + + def list_all(self) -> List[Dict]: + """Lister tous les gestes pour l'affichage.""" + return [ + { + "id": g.id, + "name": g.name, + "description": g.description, + "keys": "+".join(g.keys), + "category": g.category, + "context": g.context, + } + for g in self.gestures + ] + + # ========================================================================= + # Scoring interne + # ========================================================================= + + def _compute_score(self, query: str, terms: List[str], gesture: Gesture) -> float: + """Calculer le score de correspondance entre une requête et un geste.""" + best = 0.0 + query_words = set(query.split()) + + for term in terms: + # Match exact + if query == term: + return 1.0 + + # Contenu dans l'un ou l'autre sens + if query in term: + score = len(query) / len(term) * 0.95 + best = max(best, score) + continue + if term in query: + # Si le terme est un alias exact (mot unique) présent dans la requête + # c'est un signal très fort : "copier le texte" contient "copier" + if term in query_words: + best = max(best, 0.85) + else: + score = len(term) / len(query) * 0.9 + best = max(best, score) + continue + + # Similarité de séquence + ratio = SequenceMatcher(None, query, term).ratio() + best = max(best, ratio) + + # Bonus si tous les mots de la requête sont présents dans les termes + all_terms_text = " ".join(terms) + matched_words = sum(1 for w in query_words if w in all_terms_text) + if query_words: + word_ratio = matched_words / len(query_words) + if word_ratio >= 0.8: + best = max(best, 0.5 + word_ratio * 0.4) + + return best + + def _match_by_keys(self, keys: List[str]) -> Optional[Gesture]: + """Trouver un geste par sa combinaison de touches exacte.""" + keys_normalized = [k.lower() for k in keys] + for gesture in self.gestures: + if gesture.keys == 
keys_normalized: + return gesture + return None + + def _match_click_as_gesture(self, action: Dict) -> Optional[Gesture]: + """ + Détecter si un clic correspond à un geste primitif. + + Patterns : + - Clic en haut à droite de la fenêtre (x > 95%, y < 5%) → fermer + - target_text contenant ✕, ×, X, □, ─, etc. + """ + # Vérifier le target_text + target_text = ( + action.get("target_text", "") or + action.get("target_spec", {}).get("by_text", "") + ).strip() + + if target_text: + target_lower = target_text.lower() + # Bouton fermer + if target_lower in ("✕", "×", "x", "close", "fermer"): + return self._by_id.get("win_close") + # Bouton maximiser + if target_lower in ("□", "☐", "maximize", "agrandir"): + return self._by_id.get("win_maximize") + # Bouton minimiser + if target_lower in ("─", "—", "_", "minimize", "réduire"): + return self._by_id.get("win_minimize") + + # Vérifier la position relative (coin haut-droite = fermer) + x_pct = action.get("x_pct", 0) + y_pct = action.get("y_pct", 0) + + if x_pct > 0.96 and y_pct < 0.04: + return self._by_id.get("win_close") + if 0.92 < x_pct < 0.96 and y_pct < 0.04: + return self._by_id.get("win_maximize") + if 0.88 < x_pct < 0.92 and y_pct < 0.04: + return self._by_id.get("win_minimize") + + return None + + def optimize_replay_actions(self, actions: List[Dict]) -> List[Dict]: + """ + Optimiser une liste d'actions de replay en substituant les gestes connus. + + Pour chaque action, si elle correspond à un geste primitif, + on la remplace par le raccourci clavier équivalent. + + Retourne la liste d'actions optimisée (les originales non-matchées + sont conservées telles quelles). + """ + optimized = [] + substitutions = 0 + + for action in actions: + gesture = self.match_action(action) + if gesture and action.get("type") != "key_combo": + # Substituer par le raccourci clavier + new_action = gesture.to_replay_action() + # Conserver l'action_id original pour le tracking + new_action["action_id"] = action.get("action_id", new_action["action_id"]) + new_action["original_type"] = action.get("type") + optimized.append(new_action) + substitutions += 1 + logger.debug( + f"Geste substitué: {action.get('type')} → {gesture.id} ({gesture.name})" + ) + else: + optimized.append(action) + + if substitutions: + logger.info( + f"Replay optimisé: {substitutions} action(s) substituée(s) par des primitives" + ) + + return optimized + + +# Singleton +_catalog: Optional[GestureCatalog] = None + + +def get_gesture_catalog() -> GestureCatalog: + global _catalog + if _catalog is None: + _catalog = GestureCatalog() + return _catalog diff --git a/agent_chat/intent_parser.py b/agent_chat/intent_parser.py index bd99d2b64..5e4cfd6b3 100644 --- a/agent_chat/intent_parser.py +++ b/agent_chat/intent_parser.py @@ -29,6 +29,7 @@ class IntentType(Enum): LIST = "list" # Lister les workflows disponibles CONFIGURE = "configure" # Configurer un paramètre HELP = "help" # Demander de l'aide + GREETING = "greeting" # Salutation STATUS = "status" # Vérifier le statut CANCEL = "cancel" # Annuler l'exécution en cours HISTORY = "history" # Voir l'historique @@ -74,27 +75,64 @@ class IntentParser: # Patterns pour la détection d'intentions par règles INTENT_PATTERNS = { IntentType.EXECUTE: [ - r"(?:lance|exécute|démarre|fait|run|start|execute)\s+(.+)", + # Verbes d'action explicites + r"(?:lance|exécute|démarre|fai[st]|run|start|execute)\s+(.+)", r"(?:je veux|je voudrais|peux-tu)\s+(.+)", r"(?:facturer?|créer?|générer?|exporter?)\s+(.+)", r"^(.+)\s+(?:maintenant|tout de suite|svp|stp)$", + # Gestes 
courants (UI actions) — doivent rester EXECUTE + r"(?:ferme[rz]?|ouvr[eir]+[sz]?|clique[rz]?|sélectionne[rz]?|coche[rz]?|décoche[rz]?)\s+(.+)", + r"(?:copie[rz]?|colle[rz]?|coupe[rz]?|supprime[rz]?|efface[rz]?)\s+(.+)", + r"(?:tape[rz]?|écri[rstv]+[sz]?|saisi[rstv]*[sz]?|rempli[rstv]*[sz]?|entre[rz]?)\s+(.+)", + r"(?:scroll(?:e[rz]?)?|défile[rz]?|fait(?:es)?\s+défiler)\s*(.+)?", + r"(?:glisse[rz]?|drag(?:ue)?[rz]?|déplace[rz]?|bouge[rz]?)\s+(.+)", + r"(?:double[- ]?clique[rz]?|clic\s+droit)\s+(.+)?", + r"(?:enregistre[rz]?|sauvegarde[rz]?|save)\s+(.+)?", + r"(?:imprime[rz]?|print)\s+(.+)?", + r"(?:envoie[rz]?|send|mail(?:e[rz]?)?|transmet[sz]?)\s+(.+)", + r"(?:télécharge[rz]?|download|upload)\s+(.+)?", + r"(?:actualise[rz]?|rafraîchi[rstv]*[sz]?|refresh|recharge[rz]?)\s*(.+)?", + r"(?:valide[rz]?|confirme[rz]?|soumets?|submit)\s+(.+)", + r"(?:connecte[rz]?|login|log\s*in|sign\s*in)\s*(.+)?", + r"(?:déconnecte[rz]?|logout|log\s*out|sign\s*out)\s*(.+)?", + # Raccourcis clavier + r"(?:ctrl|alt|shift|maj)\s*\+\s*\w+", ], IntentType.LIST: [ - r"(?:liste|montre|affiche|quels sont)\s+(?:les\s+|des\s+)?(?:workflows?|processus|automatisations?)", + r"(?:liste|montre|affiche|quels?\s+sont)\s+(?:les\s+|des\s+)?(?:workflows?|processus|automatisations?)", + r"(?:quels?|quelles?)\s+(?:workflows?|processus|automatisations?)", r"liste\s+des\s+workflows?", - r"(?:qu'est-ce que|que)\s+(?:je peux|tu peux)\s+faire", r"(?:workflows?|processus)\s+disponibles?", r"(?:voir|afficher)\s+(?:les\s+|tous\s+les\s+)?workflows?", ], IntentType.QUERY: [ - r"(?:comment|pourquoi|quand|où|qui)\s+(.+)\?", + # Questions directes avec mots interrogatifs + r"(?:comment|pourquoi|quand|où|qui)\s+(.+)\??", r"(?:explique|décris|détaille)\s+(.+)", r"(?:qu'est-ce que|c'est quoi)\s+(.+)", + # Questions avec "quel/quelle/quels/quelles" (exclure workflows → LIST) + r"(?:quels?|quelles?)\s+(?!workflows?|processus|automatisations?)(.+)\??", + # "quoi" comme question (pas une commande, pas "quoi faire" = HELP) + r"^(?:c'est\s+)?quoi\s+(?!faire)(.+)\??$", + r"^quoi\s*\?+$", + # Questions indirectes + r"(?:dis[- ]moi|raconte|informe[- ]moi)\s+(.+)", + r"(?:je\s+(?:me\s+)?demande|je\s+(?:ne\s+)?comprends?\s+pas)\s+(.+)", ], IntentType.HELP: [ - r"(?:aide|help|assistance|sos)", - r"(?:comment ça marche|comment utiliser)", + r"^(?:aide|help|assistance|sos)$", + r"comment ça (?:marche|fonctionne)\s*\??", + r"comment (?:utiliser|ça s'utilise|on fait)\s*\??", r"\?{2,}", + # "que peux-tu faire", "quoi faire" = demande d'aide + r"(?:qu'est-ce que|que)\s+(?:je peux|tu peux)\s+faire", + r"^quoi\s+faire\s*\??$", + r"(?:que\s+)?(?:puis-je|peux-tu|peut-on)\s+faire\s*\??", + r"(?:besoin\s+d'aide|j'ai\s+besoin\s+d'aide)", + ], + IntentType.GREETING: [ + r"^(?:bonjour|bonsoir|salut|hello|hi|hey|coucou|yo|wesh)(?:\s.*)?$", + r"^(?:bonne?\s+(?:journée|soirée|nuit|matinée))$", ], IntentType.STATUS: [ r"(?:statut|status|état|où en est)", @@ -119,6 +157,35 @@ class IntentParser: ], } + # Verbes d'action reconnus pour le fallback EXECUTE + # Si aucun pattern ne matche, on vérifie la présence d'un de ces verbes + # avant de classifier en EXECUTE + ACTION_VERBS = { + # Actions de workflow/exécution + "lance", "lancer", "exécute", "exécuter", "démarre", "démarrer", + "fait", "fais", "run", "start", "execute", + # Actions métier + "facture", "facturer", "crée", "créer", "génère", "générer", + "exporte", "exporter", "importe", "importer", + # Actions UI / gestes + "ferme", "fermer", "ouvre", "ouvrir", "clique", "cliquer", + "sélectionne", "sélectionner", "coche", 
"cocher", "décoche", "décocher", + "copie", "copier", "colle", "coller", "coupe", "couper", + "supprime", "supprimer", "efface", "effacer", + "tape", "taper", "écris", "écrire", "saisis", "saisir", + "remplis", "remplir", "entre", "entrer", + "scroll", "scroller", "défile", "défiler", + "glisse", "glisser", "déplace", "déplacer", "drag", + "enregistre", "enregistrer", "sauvegarde", "sauvegarder", "save", + "imprime", "imprimer", "print", + "envoie", "envoyer", "send", "transmet", "transmettre", + "télécharge", "télécharger", "download", "upload", + "actualise", "actualiser", "rafraîchis", "rafraîchir", "refresh", + "valide", "valider", "confirme", "confirmer", "soumets", "soumettre", + "connecte", "connecter", "déconnecte", "déconnecter", + "login", "logout", + } + # Patterns pour l'extraction d'entités ENTITY_PATTERNS = { "client": [ @@ -280,11 +347,18 @@ class IntentParser: best_confidence = confidence best_intent = intent_type - # Si aucune intention trouvée mais la requête ressemble à une commande + # Fallback durci : ne classifier en EXECUTE que si un verbe d'action est présent if best_intent == IntentType.UNKNOWN and len(query.split()) >= 2: - # Supposer que c'est une demande d'exécution - best_intent = IntentType.EXECUTE - best_confidence = 0.4 + words = query.lower().split() + # Vérifier si au moins un mot est un verbe d'action connu + has_action_verb = any(word in self.ACTION_VERBS for word in words) + if has_action_verb: + best_intent = IntentType.EXECUTE + best_confidence = 0.40 + else: + # Pas de verbe d'action reconnu → demander clarification + best_intent = IntentType.CLARIFY + best_confidence = 0.30 return best_intent, best_confidence @@ -389,13 +463,14 @@ REQUÊTE: "{query}" {f"Contexte conversation: {json.dumps(context, ensure_ascii=False)}" if context else ""} INTENTIONS POSSIBLES: -- execute: l'utilisateur veut lancer/exécuter un workflow +- execute: l'utilisateur veut lancer/exécuter un workflow ou une action UI (geste) - list: l'utilisateur veut voir les workflows disponibles (mots-clés: liste, quels, workflows, disponibles, montrer) -- query: l'utilisateur pose une question sur un workflow +- query: l'utilisateur pose une question (comment, pourquoi, c'est quoi, quel) - status: l'utilisateur demande le statut d'exécution - cancel: l'utilisateur veut annuler - history: l'utilisateur veut voir l'historique -- help: l'utilisateur demande de l'aide +- help: l'utilisateur demande de l'aide ou ce qu'il peut faire +- greeting: l'utilisateur dit bonjour/salut/hello - confirm: l'utilisateur confirme (oui, ok, go) - deny: l'utilisateur refuse (non, annule) - unknown: impossible à déterminer @@ -504,16 +579,37 @@ if __name__ == "__main__": parser = IntentParser(use_llm=False) test_queries = [ + # EXECUTE — actions explicites "facturer le client Acme", "lance le workflow de facturation", - "quels workflows sont disponibles ?", - "aide", - "oui", - "annule", - "statut", "exporter le rapport en PDF pour Client ABC", "créer une facture de 1500€ pour Société XYZ", "facturer les clients de A à Z", + # EXECUTE — gestes UI + "ferme la fenêtre", + "ouvre un nouvel onglet", + "copier le texte", + "lance la facturation", + # LIST + "quels workflows sont disponibles ?", + "liste des workflows", + # QUERY — questions + "comment ça marche ?", + "c'est quoi ce workflow", + "pourquoi ce processus est lent ?", + # HELP + "aide", + "quoi faire ?", + "que peux-tu faire ?", + # GREETING + "bonjour", + "salut", + # Confirmations / annulations + "oui", + "annule", + "statut", + # Fallback — ne doit 
PAS être EXECUTE + "blah blah test", ] print("=== Tests IntentParser ===\n") diff --git a/agent_chat/response_generator.py b/agent_chat/response_generator.py index 97a54fdea..40e082f52 100644 --- a/agent_chat/response_generator.py +++ b/agent_chat/response_generator.py @@ -73,9 +73,16 @@ class ResponseGenerator: "Le workflow '{workflow}' a échoué: {error}" ], "not_found": [ - "Je n'ai pas trouvé de workflow correspondant à '{query}'.", - "Aucun workflow ne correspond à '{query}'. Voulez-vous voir la liste ?", - "'{query}' ne correspond à aucun workflow connu." + "Je ne sais pas encore faire '{query}'. Montre-moi comment faire et je l'apprendrai !", + "'{query}' m'est inconnu pour l'instant. Tu peux me montrer en enregistrant un workflow.", + "Je ne connais pas '{query}'. Montre-moi et je m'en souviendrai !" + ], + "gesture": [ + "{gesture_name} ({gesture_keys}) envoyé !", + "Raccourci {gesture_name} ({gesture_keys}) exécuté.", + ], + "copilot": [ + "Mode pas-à-pas activé pour '{workflow}'. Validez chaque étape.", ] }, IntentType.LIST: { @@ -108,6 +115,13 @@ class ResponseGenerator: "Tapez votre commande en langage naturel !", ] }, + IntentType.GREETING: { + "default": [ + "Bonjour ! Je suis votre assistant RPA. Comment puis-je vous aider ?", + "Salut ! Que puis-je faire pour vous ?", + "Bonjour ! Tapez une commande ou 'aide' pour voir ce que je peux faire.", + ] + }, IntentType.STATUS: { "running": [ "Exécution en cours : '{workflow}'\nProgression : {progress}%\n{message}", @@ -355,7 +369,21 @@ class ResponseGenerator: """Handler pour les intentions d'exécution.""" templates = self.RESPONSE_TEMPLATES[IntentType.EXECUTE] - if result.get("success"): + if result.get("gesture"): + # Geste primitif (raccourci clavier) + template = random.choice(templates["gesture"]) + message = template.format( + gesture_name=result.get("gesture_name", "?"), + gesture_keys=result.get("gesture_keys", "?"), + ) + suggestions = self.CONTEXTUAL_SUGGESTIONS["after_execute"] + + elif result.get("mode") == "copilot": + template = random.choice(templates["copilot"]) + message = template.format(workflow=result.get("workflow", "?")) + suggestions = ["approuver", "passer", "annuler"] + + elif result.get("success"): template = random.choice(templates["success"]) workflow = result.get("workflow", intent.workflow_hint or "inconnu") details = "" @@ -369,8 +397,9 @@ class ResponseGenerator: elif result.get("not_found"): template = random.choice(templates["not_found"]) - message = template.format(query=intent.raw_query) - suggestions = self.CONTEXTUAL_SUGGESTIONS["after_error"] + query = result.get("query", intent.raw_query) + message = template.format(query=query) + suggestions = ["lister les workflows", "aide", "enregistrer un workflow"] else: template = random.choice(templates["error"]) @@ -426,6 +455,22 @@ class ResponseGenerator: action_required=False ) + def _handle_greeting( + self, + intent: ParsedIntent, + context: Dict[str, Any], + result: Dict[str, Any] + ) -> GeneratedResponse: + """Handler pour les salutations.""" + templates = self.RESPONSE_TEMPLATES[IntentType.GREETING] + message = random.choice(templates["default"]) + + return GeneratedResponse( + message=message, + suggestions=self.CONTEXTUAL_SUGGESTIONS["idle"], + action_required=False + ) + def _handle_status( self, intent: ParsedIntent, diff --git a/agent_chat/templates/chat.html b/agent_chat/templates/chat.html index 504f8c441..f25ec2cdd 100644 --- a/agent_chat/templates/chat.html +++ b/agent_chat/templates/chat.html @@ -617,11 +617,8 @@
+                    Lancez l'enregistrement sur votre PC et montrez-moi comment faire.
+                `;
+                addMessage(data.response.message, 'bot', teachCard);
             } else if (data.result?.workflows) {
                 let msg = data.response.message + '\n\n';
                 data.result.workflows.slice(0, 5).forEach(w => {
@@ -1087,30 +1089,6 @@
             }
         }
 
-        async function sendAgentRequest(message) {
-            const response = await fetch('/api/agent/plan', {
-                method: 'POST',
-                headers: { 'Content-Type': 'application/json' },
-                body: JSON.stringify({ request: message })
-            });
-
-            const data = await response.json();
-            removeTypingIndicator();
-
-            if (data.error) {
-                addMessage(`❌ ${data.error}`);
-                return;
-            }
-
-            if (data.plan) {
-                pendingConfirmation = data.plan;
-                const card = createAgentPlanCard(data.plan);
-                addMessage(`J'ai préparé un plan pour "${message}":`, 'bot', card);
-            } else {
-                addMessage(data.message || "Je n'ai pas pu créer de plan pour cette demande.");
-            }
-        }
-
         async function confirmAction() {
             if (!pendingConfirmation) return;
@@ -1127,40 +1105,11 @@
             // Show execution progress
             const progress = createExecutionProgress();
-            addMessage("⏳ Exécution en cours...", 'bot', progress);
+            addMessage("Exécution en cours...", 'bot', progress);
 
             pendingConfirmation = null;
         }
 
-        async function executeAgentPlan() {
-            if (!pendingConfirmation) return;
-
-            isProcessing = true;
-            updateInputState();
-
-            addMessage("⏳ Exécution du plan en cours...", 'bot');
-
-            const response = await fetch('/api/agent/execute', {
-                method: 'POST',
-                headers: { 'Content-Type': 'application/json' },
-                body: JSON.stringify({ plan: pendingConfirmation })
-            });
-
-            const data = await response.json();
-
-            if (data.success) {
-                const results = data.results || [];
-                const successCount = results.filter(r => r.success).length;
-                addMessage(`✅ Plan exécuté: ${successCount}/${results.length} étapes réussies`);
-            } else {
-                addMessage(`❌ Erreur: ${data.error}`);
-            }
-
-            pendingConfirmation = null;
-            isProcessing = false;
-            updateInputState();
-        }
-
         function modifyAction() {
             if (!pendingConfirmation) return;
             addMessage("✏️ Modification non implémentée. Décrivez les changements souhaités.");
@@ -1173,7 +1122,79 @@
         function cancelExecution() {
             socket.emit('cancel_execution');
-            addMessage("⏹️ Demande d'annulation envoyée...");
+            addMessage("Demande d'annulation envoyée...");
+        }
+
+        // =====================================================
+        // Copilot Mode
+        // =====================================================
+
+        function showCopilotStep(data) {
+            const card = document.createElement('div');
+            card.className = 'action-card';
+            card.id = `copilot-step-${data.step_index}`;
+            card.innerHTML = `
+                <div>${data.action.type}: ${data.action.description}</div>
+                <div class="action-buttons" id="copilot-btns-${data.step_index}">
+                    <button onclick="copilotApprove(${data.step_index})">Approuver</button>
+                    <button onclick="copilotSkip(${data.step_index})">Passer</button>
+                    <button onclick="copilotAbort()">Annuler</button>
+                </div>
+            `;
+            addMessage(`Copilot étape ${data.step_index + 1}/${data.total}`, 'bot', card);
+        }
+
+        function copilotApprove(stepIndex) {
+            socket.emit('copilot_approve');
+            const btns = document.getElementById(`copilot-btns-${stepIndex}`);
+            if (btns) btns.innerHTML = 'Approuvé - en cours...';
+        }
+
+        function copilotSkip(stepIndex) {
+            socket.emit('copilot_skip');
+            const btns = document.getElementById(`copilot-btns-${stepIndex}`);
+            if (btns) btns.innerHTML = 'Passé';
+        }
+
+        function copilotAbort() {
+            socket.emit('copilot_abort');
+        }
+
+        function updateCopilotStepResult(data) {
+            const card = document.getElementById(`copilot-step-${data.step_index}`);
+            if (!card) return;
+
+            const btns = card.querySelector('.action-buttons') ||
+                document.getElementById(`copilot-btns-${data.step_index}`);
+            if (!btns) return;
+
+            if (data.status === 'completed') {
+                btns.innerHTML = 'Réussi';
+            } else if (data.status === 'failed') {
+                btns.innerHTML = `Échoué: ${data.message}`;
+            } else if (data.status === 'skipped') {
+                btns.innerHTML = 'Passé';
+            }
+        }
+
+        function completeCopilot(data) {
+            addMessage(`Copilot terminé: ${data.message}`);
         }
 
         // =====================================================
diff --git a/core/capture/__init__.py b/core/capture/__init__.py
index 2c2412dad..683f13dac 100644
--- a/core/capture/__init__.py
+++ b/core/capture/__init__.py
@@ -1,4 +1,11 @@
 """Screen capture module"""
 
 from .screen_capturer import ScreenCapturer
 
-__all__ = ['ScreenCapturer']
+try:
+    from .event_listener import EventListener
+except ImportError:
+    EventListener = None
+
+from .session_recorder import SessionRecorder
+
+__all__ = ['ScreenCapturer', 'EventListener', 'SessionRecorder']
diff --git a/core/capture/event_listener.py b/core/capture/event_listener.py
new file mode 100644
index 000000000..8bb6d7a4d
--- /dev/null
+++ b/core/capture/event_listener.py
@@ -0,0 +1,258 @@
+"""
+EventListener - Capture d'événements clavier/souris pour RPA Vision V3
+
+Couche 0 (RawSession) : capture en temps réel des interactions utilisateur
+(clics souris, frappes clavier) avec horodatage précis et contexte de fenêtre.
+
+Génère des objets Event compatibles avec RawSession.
+"""
+
+import logging
+import threading
+import time
+from typing import Optional, Callable, List, Dict, Any
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+try:
+    from pynput import mouse, keyboard
+    PYNPUT_AVAILABLE = True
+except ImportError:
+    mouse = None  # type: ignore
+    keyboard = None  # type: ignore
+    PYNPUT_AVAILABLE = False
+    logger.warning("pynput non disponible — EventListener désactivé")
+
+
+class EventListener:
+    """
+    Listener d'événements clavier/souris basé sur pynput.
+
+    Capture les interactions utilisateur en temps réel et les transmet
+    via un callback. Compatible avec le format Event de RawSession.
+
+    Example:
+        >>> listener = EventListener()
+        >>> listener.start(callback=on_event)
+        >>> # ... l'utilisateur interagit ...
+        >>> events = listener.stop()
+    """
+
+    def __init__(self, capture_mouse_move: bool = False):
+        """
+        Args:
+            capture_mouse_move: Capturer les déplacements souris (volumineux, désactivé par défaut)
+        """
+        if not PYNPUT_AVAILABLE:
+            raise ImportError(
+                "pynput est requis pour EventListener. 
" + "Installer avec: pip install pynput" + ) + + self.capture_mouse_move = capture_mouse_move + self._running = False + self._start_time: Optional[float] = None + self._events: List[Dict[str, Any]] = [] + self._callback: Optional[Callable[[Dict[str, Any]], None]] = None + self._lock = threading.Lock() + + self._mouse_listener = None + self._keyboard_listener = None + + def start(self, callback: Optional[Callable[[Dict[str, Any]], None]] = None) -> None: + """ + Démarrer la capture d'événements. + + Args: + callback: Fonction appelée pour chaque événement capturé. + Reçoit un dict au format Event.to_dict(). + """ + if self._running: + logger.warning("EventListener déjà en cours") + return + + self._callback = callback + self._events = [] + self._start_time = time.time() + self._running = True + + # Démarrer les listeners + self._mouse_listener = mouse.Listener( + on_click=self._on_click, + on_scroll=self._on_scroll, + on_move=self._on_move if self.capture_mouse_move else None, + ) + self._keyboard_listener = keyboard.Listener( + on_press=self._on_key_press, + on_release=self._on_key_release, + ) + + self._mouse_listener.start() + self._keyboard_listener.start() + + logger.info("EventListener démarré") + + def stop(self) -> List[Dict[str, Any]]: + """ + Arrêter la capture et retourner les événements capturés. + + Returns: + Liste de dicts au format Event + """ + self._running = False + + if self._mouse_listener: + self._mouse_listener.stop() + self._mouse_listener = None + if self._keyboard_listener: + self._keyboard_listener.stop() + self._keyboard_listener = None + + logger.info(f"EventListener arrêté — {len(self._events)} événements capturés") + + with self._lock: + return list(self._events) + + @property + def is_running(self) -> bool: + return self._running + + @property + def event_count(self) -> int: + with self._lock: + return len(self._events) + + def _relative_time(self) -> float: + """Temps relatif depuis le début de la capture.""" + if self._start_time is None: + return 0.0 + return round(time.time() - self._start_time, 3) + + def _get_window_context(self) -> Dict[str, str]: + """Obtenir le contexte de la fenêtre active.""" + try: + import subprocess + # Utiliser xdotool sur Linux pour obtenir la fenêtre active + result = subprocess.run( + ["xdotool", "getactivewindow", "getwindowname"], + capture_output=True, text=True, timeout=1 + ) + title = result.stdout.strip() if result.returncode == 0 else "Unknown" + + result2 = subprocess.run( + ["xdotool", "getactivewindow", "getwindowpid"], + capture_output=True, text=True, timeout=1 + ) + pid = result2.stdout.strip() if result2.returncode == 0 else "" + + # Essayer d'obtenir le nom du process + app_name = "unknown" + if pid: + try: + result3 = subprocess.run( + ["ps", "-p", pid, "-o", "comm="], + capture_output=True, text=True, timeout=1 + ) + app_name = result3.stdout.strip() if result3.returncode == 0 else "unknown" + except Exception: + pass + + return {"title": title, "app_name": app_name} + except Exception: + return {"title": "Unknown", "app_name": "unknown"} + + def _emit_event(self, event: Dict[str, Any]) -> None: + """Enregistrer et émettre un événement.""" + with self._lock: + self._events.append(event) + + if self._callback: + try: + self._callback(event) + except Exception as e: + logger.error(f"Erreur callback événement: {e}") + + # === Handlers souris === + + def _on_click(self, x: int, y: int, button, pressed: bool) -> None: + if not self._running or not pressed: + return + + event = { + "t": self._relative_time(), + 
"type": "mouse_click", + "button": button.name, + "pos": [x, y], + "window": self._get_window_context(), + "screenshot_id": None, + } + self._emit_event(event) + + def _on_scroll(self, x: int, y: int, dx: int, dy: int) -> None: + if not self._running: + return + + event = { + "t": self._relative_time(), + "type": "mouse_scroll", + "delta": dy * 120, + "pos": [x, y], + "window": self._get_window_context(), + "screenshot_id": None, + } + self._emit_event(event) + + def _on_move(self, x: int, y: int) -> None: + if not self._running: + return + + event = { + "t": self._relative_time(), + "type": "mouse_move", + "pos": [x, y], + "window": self._get_window_context(), + "screenshot_id": None, + } + self._emit_event(event) + + # === Handlers clavier === + + def _on_key_press(self, key) -> None: + if not self._running: + return + + key_name = self._key_to_string(key) + + event = { + "t": self._relative_time(), + "type": "key_press", + "keys": [key_name], + "window": self._get_window_context(), + "screenshot_id": None, + } + self._emit_event(event) + + def _on_key_release(self, key) -> None: + if not self._running: + return + + key_name = self._key_to_string(key) + + event = { + "t": self._relative_time(), + "type": "key_release", + "keys": [key_name], + "window": self._get_window_context(), + "screenshot_id": None, + } + self._emit_event(event) + + @staticmethod + def _key_to_string(key) -> str: + """Convertir une touche pynput en string lisible.""" + if hasattr(key, 'char') and key.char: + return key.char + if hasattr(key, 'name'): + return key.name.upper() + return str(key) diff --git a/core/capture/session_recorder.py b/core/capture/session_recorder.py new file mode 100644 index 000000000..47d1fda17 --- /dev/null +++ b/core/capture/session_recorder.py @@ -0,0 +1,344 @@ +""" +SessionRecorder - Enregistrement de sessions RPA complètes + +Orchestre EventListener + ScreenCapturer pour produire un RawSession : + - Capture les événements clavier/souris en continu + - Prend un screenshot à chaque clic (ou périodiquement) + - Sauvegarde les screenshots sur disque + - Produit un RawSession complet avec events + screenshots liés + +Usage: + >>> recorder = SessionRecorder(output_dir="data/sessions") + >>> recorder.start(workflow_name="login_workflow") + >>> # ... l'utilisateur effectue ses actions ... + >>> session = recorder.stop() + >>> print(f"{len(session.events)} events, {len(session.screenshots)} screenshots") +""" + +import logging +import os +import platform +import threading +import time +from datetime import datetime +from pathlib import Path +from typing import Optional, Callable, Dict, Any, List + +from core.models.raw_session import RawSession, Event, Screenshot, RawWindowContext + +logger = logging.getLogger(__name__) + + +class SessionRecorder: + """ + Enregistreur de sessions RPA complet. + + Combine EventListener (clavier/souris) et ScreenCapturer (screenshots) + pour produire une RawSession exploitable par le GraphBuilder. 
+ """ + + def __init__( + self, + output_dir: str = "data/training/sessions", + screenshot_on_click: bool = True, + screenshot_interval_ms: int = 0, + capture_keyboard: bool = True, + ): + """ + Args: + output_dir: Répertoire de sortie pour les sessions + screenshot_on_click: Prendre un screenshot à chaque clic + screenshot_interval_ms: Intervalle de capture périodique (0 = désactivé) + capture_keyboard: Capturer les frappes clavier + """ + self.output_dir = Path(output_dir) + self.screenshot_on_click = screenshot_on_click + self.screenshot_interval_ms = screenshot_interval_ms + self.capture_keyboard = capture_keyboard + + self._session: Optional[RawSession] = None + self._session_dir: Optional[Path] = None + self._screenshots_dir: Optional[Path] = None + self._running = False + self._screenshot_counter = 0 + self._lock = threading.Lock() + + # Composants (lazy init) + self._event_listener = None + self._screen_capturer = None + self._periodic_thread: Optional[threading.Thread] = None + + # Callbacks optionnels + self._on_event: Optional[Callable[[Dict[str, Any]], None]] = None + self._on_screenshot: Optional[Callable[[str], None]] = None + + def start( + self, + workflow_name: str = "", + session_id: Optional[str] = None, + on_event: Optional[Callable[[Dict[str, Any]], None]] = None, + on_screenshot: Optional[Callable[[str], None]] = None, + ) -> str: + """ + Démarrer l'enregistrement d'une session. + + Args: + workflow_name: Nom du workflow pour le contexte + session_id: ID de session (généré si None) + on_event: Callback appelé pour chaque événement + on_screenshot: Callback appelé pour chaque screenshot + + Returns: + session_id de la session démarrée + """ + if self._running: + logger.warning("SessionRecorder déjà en cours") + return self._session.session_id if self._session else "" + + # Générer ID de session + if session_id is None: + session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + # Créer répertoires + self._session_dir = self.output_dir / session_id + self._screenshots_dir = self._session_dir / session_id / "screenshots" + self._screenshots_dir.mkdir(parents=True, exist_ok=True) + + # Initialiser la session + self._session = RawSession( + session_id=session_id, + agent_version="rpa_vision_v3", + environment=self._get_environment(), + user={"id": os.getenv("USER", "unknown")}, + context={"workflow": workflow_name, "tags": []}, + started_at=datetime.now(), + ) + + self._screenshot_counter = 0 + self._on_event = on_event + self._on_screenshot = on_screenshot + self._running = True + + # Démarrer le listener d'événements + self._start_event_listener() + + # Démarrer la capture périodique si configurée + if self.screenshot_interval_ms > 0: + self._start_periodic_capture() + + logger.info( + f"SessionRecorder démarré: {session_id} " + f"(screenshots_dir={self._screenshots_dir})" + ) + return session_id + + def stop(self) -> RawSession: + """ + Arrêter l'enregistrement et retourner la session complète. 
+ + Returns: + RawSession avec tous les événements et screenshots + """ + if not self._running: + logger.warning("SessionRecorder non démarré") + return self._session + + self._running = False + + # Arrêter la capture périodique + if self._periodic_thread and self._periodic_thread.is_alive(): + self._periodic_thread.join(timeout=2) + + # Arrêter le listener d'événements + if self._event_listener: + self._event_listener.stop() + + # Finaliser la session + self._session.ended_at = datetime.now() + + # Sauvegarder la session JSON + session_path = self._session_dir / f"{self._session.session_id}.json" + self._session.save_to_file(session_path) + + logger.info( + f"SessionRecorder arrêté: {self._session.session_id} " + f"({len(self._session.events)} events, " + f"{len(self._session.screenshots)} screenshots) " + f"→ {session_path}" + ) + + return self._session + + @property + def is_running(self) -> bool: + return self._running + + @property + def event_count(self) -> int: + return len(self._session.events) if self._session else 0 + + @property + def screenshot_count(self) -> int: + return len(self._session.screenshots) if self._session else 0 + + # ========================================================================= + # Capture d'événements + # ========================================================================= + + def _start_event_listener(self) -> None: + """Démarrer le listener d'événements.""" + try: + from core.capture.event_listener import EventListener + + self._event_listener = EventListener(capture_mouse_move=False) + self._event_listener.start(callback=self._on_raw_event) + logger.info("EventListener démarré") + except ImportError: + logger.warning( + "EventListener non disponible (pynput manquant). " + "Seuls les screenshots périodiques seront capturés." 
+ ) + + def _on_raw_event(self, raw_event: Dict[str, Any]) -> None: + """Callback appelé par EventListener pour chaque événement.""" + if not self._running or not self._session: + return + + # Convertir en Event + event = Event( + t=raw_event.get("t", 0.0), + type=raw_event.get("type", "unknown"), + window=RawWindowContext( + title=raw_event.get("window", {}).get("title", "Unknown"), + app_name=raw_event.get("window", {}).get("app_name", "unknown"), + ), + screenshot_id=None, + data={ + k: v + for k, v in raw_event.items() + if k not in ("t", "type", "window", "screenshot_id") + }, + ) + + # Screenshot sur clic + if self.screenshot_on_click and event.type == "mouse_click": + screenshot_id = self._take_screenshot() + if screenshot_id: + event.screenshot_id = screenshot_id + + with self._lock: + self._session.add_event(event) + + # Callback utilisateur + if self._on_event: + try: + self._on_event(raw_event) + except Exception as e: + logger.warning(f"Erreur callback on_event: {e}") + + # ========================================================================= + # Capture de screenshots + # ========================================================================= + + def _take_screenshot(self) -> Optional[str]: + """Prendre un screenshot et le sauvegarder.""" + if not self._running or not self._session: + return None + + try: + self._ensure_screen_capturer() + if self._screen_capturer is None: + return None + + frame = self._screen_capturer.capture_frame() + if frame is None: + return None + + # Sauvegarder + self._screenshot_counter += 1 + screenshot_id = f"ss_{self._screenshot_counter:04d}" + filename = f"screen_{self._screenshot_counter:04d}.png" + filepath = self._screenshots_dir / filename + + self._screen_capturer.save_frame(frame, str(filepath)) + + # Enregistrer dans la session + screenshot = Screenshot( + screenshot_id=screenshot_id, + relative_path=f"screenshots/{filename}", + captured_at=datetime.now().isoformat(), + ) + + with self._lock: + self._session.add_screenshot(screenshot) + + # Callback utilisateur + if self._on_screenshot: + try: + self._on_screenshot(str(filepath)) + except Exception as e: + logger.warning(f"Erreur callback on_screenshot: {e}") + + return screenshot_id + + except Exception as e: + logger.warning(f"Erreur capture screenshot: {e}") + return None + + def _ensure_screen_capturer(self) -> None: + """Initialiser le ScreenCapturer (lazy).""" + if self._screen_capturer is not None: + return + + try: + from core.capture.screen_capturer import ScreenCapturer + + self._screen_capturer = ScreenCapturer( + buffer_size=5, + detect_changes=False, + ) + except Exception as e: + logger.warning(f"ScreenCapturer non disponible: {e}") + + def _start_periodic_capture(self) -> None: + """Démarrer la capture périodique en thread.""" + interval_s = self.screenshot_interval_ms / 1000.0 + + def _periodic_loop(): + while self._running: + self._take_screenshot() + time.sleep(interval_s) + + self._periodic_thread = threading.Thread( + target=_periodic_loop, daemon=True, name="periodic_capture" + ) + self._periodic_thread.start() + logger.info( + f"Capture périodique démarrée (intervalle={self.screenshot_interval_ms}ms)" + ) + + # ========================================================================= + # Helpers + # ========================================================================= + + def _get_environment(self) -> Dict[str, Any]: + """Collecter les informations d'environnement.""" + env = { + "os": platform.system().lower(), + "os_version": platform.version(), + 
"hostname": platform.node(), + "screen": {}, + } + + # Résolution d'écran + try: + self._ensure_screen_capturer() + if self._screen_capturer: + w, h = self._screen_capturer.get_screen_resolution() + env["screen"] = { + "primary_resolution": [w, h], + } + except Exception: + env["screen"] = {"primary_resolution": [1920, 1080]} + + return env diff --git a/core/detection/ui_detector.py b/core/detection/ui_detector.py index 5f5794bdd..8866d3e6c 100644 --- a/core/detection/ui_detector.py +++ b/core/detection/ui_detector.py @@ -69,9 +69,10 @@ class DetectionConfig: """Configuration de la détection UI hybride""" # VLM # Modèles recommandés: - # - "qwen2.5vl:7b" (plus rapide, meilleur avec format='json', recommandé) + # - "qwen2.5vl:3b" (léger, tient en GPU 12GB avec split partiel) + # - "qwen2.5vl:7b" (meilleur mais 13GB mémoire, CPU-only sur RTX 5070) # - "qwen3-vl:8b" (plus gros, supporté mais plus d'erreurs JSON) - vlm_model: str = "qwen2.5vl:7b" + vlm_model: str = "qwen2.5vl:3b" vlm_endpoint: str = "http://localhost:11434" use_vlm_classification: bool = True # Utiliser VLM pour classifier diff --git a/core/embedding/faiss_manager.py b/core/embedding/faiss_manager.py index 0acfca368..aab14c548 100644 --- a/core/embedding/faiss_manager.py +++ b/core/embedding/faiss_manager.py @@ -451,6 +451,9 @@ class FAISSManager: return results + # Alias pour compatibilité (WorkflowPipeline, NodeMatcher) + search = search_similar + def remove_embedding(self, faiss_id: int) -> bool: """ Supprimer un embedding de l'index diff --git a/core/embedding/state_embedding_builder.py b/core/embedding/state_embedding_builder.py index 28962d1da..a88656c05 100644 --- a/core/embedding/state_embedding_builder.py +++ b/core/embedding/state_embedding_builder.py @@ -212,8 +212,8 @@ class StateEmbeddingBuilder: # Concaténer tous les textes détectés texts = [] - if hasattr(screen_state.perception, 'detected_texts'): - texts = screen_state.perception.detected_texts + if hasattr(screen_state.perception, 'detected_text'): + texts = screen_state.perception.detected_text combined_text = " ".join(texts) if texts else "" diff --git a/core/evaluation/workflow_simulation_report.py b/core/evaluation/workflow_simulation_report.py index 96ceae6eb..98dda531c 100644 --- a/core/evaluation/workflow_simulation_report.py +++ b/core/evaluation/workflow_simulation_report.py @@ -664,12 +664,12 @@ class WorkflowSimulator: try: if check.kind == "text_present": # Vérifier présence de texte - detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else [] + detected_texts = getattr(screen_state.perception, 'detected_text', []) if hasattr(screen_state, 'perception') else [] return any(check.value in text for text in detected_texts) elif check.kind == "text_absent": # Vérifier absence de texte - detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else [] + detected_texts = getattr(screen_state.perception, 'detected_text', []) if hasattr(screen_state, 'perception') else [] return not any(check.value in text for text in detected_texts) elif check.kind == "element_present": @@ -681,7 +681,7 @@ class WorkflowSimulator: elif check.kind == "window_title_contains": # Vérifier titre de fenêtre - window_title = getattr(screen_state.raw_level, 'window_title', '') if hasattr(screen_state, 'raw_level') else '' + window_title = getattr(screen_state.window, 'window_title', '') if hasattr(screen_state, 'window') else '' return 
check.value in window_title else: diff --git a/core/execution/error_handler.py b/core/execution/error_handler.py index 523a4d805..33362eb48 100644 --- a/core/execution/error_handler.py +++ b/core/execution/error_handler.py @@ -509,13 +509,13 @@ class ErrorHandler: 'workflow_edge': edge, 'action': action, 'details': { - 'target_role': action.target.role if hasattr(action.target, 'role') else None, - 'target_text': action.target.text_pattern if hasattr(action.target, 'text_pattern') else None + 'target_role': action.target.by_role if hasattr(action.target, 'by_role') else None, + 'target_text': action.target.by_text if hasattr(action.target, 'by_text') else None }, 'original_data': { 'target': { - 'role': action.target.role if hasattr(action.target, 'role') else None, - 'text_pattern': action.target.text_pattern if hasattr(action.target, 'text_pattern') else None, + 'by_role': action.target.by_role if hasattr(action.target, 'by_role') else None, + 'by_text': action.target.by_text if hasattr(action.target, 'by_text') else None, 'bbox': getattr(action.target, 'bbox', None) } } diff --git a/core/extraction/__init__.py b/core/extraction/__init__.py new file mode 100644 index 000000000..4d7b3b49f --- /dev/null +++ b/core/extraction/__init__.py @@ -0,0 +1,29 @@ +""" +Module d'extraction de donnees structurees depuis des captures d'ecran. + +Ce module orchestre le cycle complet : + schema YAML -> navigation -> screenshot -> VLM/OCR -> validation -> SQLite -> CSV/Excel + +Classes principales : + - ExtractionSchema : definition des champs et regles de navigation + - ExtractionField : definition d'un champ individuel + - FieldExtractor : extraction via VLM (Ollama) ou OCR (docTR) + - DataStore : stockage SQLite + export CSV/Excel + - IterationController : controle de la boucle de navigation + - ExtractionEngine : orchestrateur principal +""" + +from .schema import ExtractionField, ExtractionSchema +from .field_extractor import FieldExtractor +from .data_store import DataStore +from .iteration_controller import IterationController +from .extraction_engine import ExtractionEngine + +__all__ = [ + "ExtractionField", + "ExtractionSchema", + "FieldExtractor", + "DataStore", + "IterationController", + "ExtractionEngine", +] diff --git a/core/extraction/data_store.py b/core/extraction/data_store.py new file mode 100644 index 000000000..5d71b05cc --- /dev/null +++ b/core/extraction/data_store.py @@ -0,0 +1,420 @@ +""" +DataStore - Stockage SQLite des donnees extraites + export CSV/Excel + +Chaque session d'extraction (ExtractionSchema applique a un ecran) cree +une entree dans la table `extractions`. Les enregistrements individuels +sont stockes dans la table `records` avec leurs donnees JSON, le chemin +du screenshot source et un score de confiance. 
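+
+Exemple d'utilisation (esquisse, suppose un ExtractionSchema deja construit) :
+    >>> store = DataStore("data/extractions/store.db")
+    >>> eid = store.create_extraction(schema)
+    >>> store.add_record(eid, {"nom": "DUPONT"}, confidence=0.9)
+    >>> store.finish_extraction(eid)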
+""" + +import csv +import json +import logging +import sqlite3 +import uuid +from datetime import datetime +from io import StringIO +from pathlib import Path +from typing import Any, Dict, List, Optional + +from .schema import ExtractionSchema + +logger = logging.getLogger(__name__) + + +class DataStore: + """Stockage des donnees extraites dans SQLite avec export CSV/Excel.""" + + def __init__(self, db_path: str = "data/extractions/store.db"): + self.db_path = Path(db_path) + self.db_path.parent.mkdir(parents=True, exist_ok=True) + self._init_db() + + # ------------------------------------------------------------------ + # Initialisation + # ------------------------------------------------------------------ + + def _init_db(self) -> None: + """Creer les tables si necessaire.""" + with self._connect() as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS extractions ( + id TEXT PRIMARY KEY, + schema_name TEXT NOT NULL, + schema_json TEXT NOT NULL, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'in_progress', + record_count INTEGER NOT NULL DEFAULT 0 + ) + """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS records ( + id TEXT PRIMARY KEY, + extraction_id TEXT NOT NULL, + data_json TEXT NOT NULL, + screenshot_path TEXT, + confidence REAL NOT NULL DEFAULT 0.0, + errors_json TEXT, + created_at TEXT NOT NULL, + FOREIGN KEY (extraction_id) REFERENCES extractions(id) + ) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_records_extraction + ON records(extraction_id) + """) + + def _connect(self) -> sqlite3.Connection: + """Ouvrir une connexion SQLite.""" + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + conn.execute("PRAGMA journal_mode=WAL") + return conn + + # ------------------------------------------------------------------ + # Extractions (sessions) + # ------------------------------------------------------------------ + + def create_extraction(self, schema: ExtractionSchema) -> str: + """ + Creer une nouvelle session d'extraction. + + Args: + schema: Schema d'extraction + + Returns: + extraction_id (UUID) + """ + extraction_id = str(uuid.uuid4()) + now = datetime.utcnow().isoformat() + + with self._connect() as conn: + conn.execute( + """ + INSERT INTO extractions (id, schema_name, schema_json, created_at, updated_at, status) + VALUES (?, ?, ?, ?, ?, ?) + """, + ( + extraction_id, + schema.name, + json.dumps(schema.to_dict(), ensure_ascii=False), + now, + now, + "in_progress", + ), + ) + + logger.info( + "Extraction creee : %s (schema=%s)", extraction_id[:8], schema.name + ) + return extraction_id + + def finish_extraction(self, extraction_id: str, status: str = "completed") -> None: + """Marquer une extraction comme terminee.""" + now = datetime.utcnow().isoformat() + with self._connect() as conn: + conn.execute( + "UPDATE extractions SET status = ?, updated_at = ? 
WHERE id = ?", + (status, now, extraction_id), + ) + + def get_extraction(self, extraction_id: str) -> Optional[Dict[str, Any]]: + """Recuperer les metadonnees d'une extraction.""" + with self._connect() as conn: + row = conn.execute( + "SELECT * FROM extractions WHERE id = ?", (extraction_id,) + ).fetchone() + if row: + return dict(row) + return None + + def list_extractions(self, limit: int = 50) -> List[Dict[str, Any]]: + """Lister les extractions recentes.""" + with self._connect() as conn: + rows = conn.execute( + "SELECT * FROM extractions ORDER BY created_at DESC LIMIT ?", + (limit,), + ).fetchall() + return [dict(r) for r in rows] + + # ------------------------------------------------------------------ + # Records (enregistrements) + # ------------------------------------------------------------------ + + def add_record( + self, + extraction_id: str, + data: Dict[str, Any], + screenshot_path: Optional[str] = None, + confidence: float = 0.0, + errors: Optional[List[str]] = None, + ) -> str: + """ + Ajouter un enregistrement extrait. + + Args: + extraction_id: ID de la session d'extraction + data: Donnees extraites (dict) + screenshot_path: Chemin du screenshot source + confidence: Score de confiance [0, 1] + errors: Liste d'erreurs de validation + + Returns: + record_id (UUID) + """ + record_id = str(uuid.uuid4()) + now = datetime.utcnow().isoformat() + + with self._connect() as conn: + conn.execute( + """ + INSERT INTO records (id, extraction_id, data_json, screenshot_path, + confidence, errors_json, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ( + record_id, + extraction_id, + json.dumps(data, ensure_ascii=False), + screenshot_path, + confidence, + json.dumps(errors or [], ensure_ascii=False), + now, + ), + ) + # Mettre a jour le compteur + conn.execute( + """ + UPDATE extractions + SET record_count = record_count + 1, updated_at = ? + WHERE id = ? + """, + (now, extraction_id), + ) + + logger.debug( + "Record ajoute : %s (extraction=%s, confiance=%.2f)", + record_id[:8], + extraction_id[:8], + confidence, + ) + return record_id + + def get_records(self, extraction_id: str) -> List[Dict[str, Any]]: + """ + Recuperer tous les enregistrements d'une extraction. + + Returns: + Liste de dicts avec les cles : id, data, screenshot_path, + confidence, errors, created_at + """ + with self._connect() as conn: + rows = conn.execute( + """ + SELECT id, data_json, screenshot_path, confidence, + errors_json, created_at + FROM records + WHERE extraction_id = ? + ORDER BY created_at ASC + """, + (extraction_id,), + ).fetchall() + + results = [] + for row in rows: + results.append({ + "id": row["id"], + "data": json.loads(row["data_json"]), + "screenshot_path": row["screenshot_path"], + "confidence": row["confidence"], + "errors": json.loads(row["errors_json"]) if row["errors_json"] else [], + "created_at": row["created_at"], + }) + return results + + # ------------------------------------------------------------------ + # Export + # ------------------------------------------------------------------ + + def export_csv(self, extraction_id: str, output_path: str) -> str: + """ + Exporter les enregistrements en CSV. 
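+
+        Les colonnes exportees sont l'union des cles rencontrees dans
+        tous les records, dans leur ordre d'apparition. Exemple (esquisse) :
+            >>> store.export_csv(eid, "exports/clients.csv")
+            'exports/clients.csv'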
+
+        Args:
+            extraction_id: ID de la session
+            output_path: Chemin du fichier CSV de sortie
+
+        Returns:
+            Chemin du fichier cree
+        """
+        records = self.get_records(extraction_id)
+        if not records:
+            raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")
+
+        out = Path(output_path)
+        out.parent.mkdir(parents=True, exist_ok=True)
+
+        # Determiner les colonnes : union des cles de tous les records
+        all_keys = self._collect_all_keys(records)
+
+        with open(out, "w", newline="", encoding="utf-8-sig") as f:
+            writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
+            writer.writeheader()
+            for rec in records:
+                writer.writerow(rec["data"])
+
+        logger.info("Export CSV : %s (%d lignes)", output_path, len(records))
+        return str(out)
+
+    def export_excel(self, extraction_id: str, output_path: str) -> str:
+        """
+        Exporter les enregistrements en Excel (openpyxl).
+
+        Args:
+            extraction_id: ID de la session
+            output_path: Chemin du fichier Excel de sortie
+
+        Returns:
+            Chemin du fichier cree
+
+        Raises:
+            ImportError: Si openpyxl n'est pas installe
+        """
+        try:
+            import openpyxl
+        except ImportError:
+            raise ImportError(
+                "openpyxl est requis pour l'export Excel. "
+                "Installez-le : pip install openpyxl"
+            )
+
+        records = self.get_records(extraction_id)
+        if not records:
+            raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")
+
+        out = Path(output_path)
+        out.parent.mkdir(parents=True, exist_ok=True)
+
+        all_keys = self._collect_all_keys(records)
+
+        wb = openpyxl.Workbook()
+        ws = wb.active
+        ws.title = "Extraction"
+
+        # En-tetes
+        for col_idx, key in enumerate(all_keys, start=1):
+            cell = ws.cell(row=1, column=col_idx, value=key)
+            cell.font = openpyxl.styles.Font(bold=True)
+
+        # Donnees
+        for row_idx, rec in enumerate(records, start=2):
+            for col_idx, key in enumerate(all_keys, start=1):
+                ws.cell(row=row_idx, column=col_idx, value=rec["data"].get(key, ""))
+
+        # Ajuster la largeur des colonnes
+        for col_idx, key in enumerate(all_keys, start=1):
+            max_len = max(
+                len(str(key)),
+                *(len(str(rec["data"].get(key, ""))) for rec in records),
+            )
+            ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = min(max_len + 2, 50)
+
+        wb.save(str(out))
+        logger.info("Export Excel : %s (%d lignes)", output_path, len(records))
+        return str(out)
+
+    # ------------------------------------------------------------------
+    # Statistiques
+    # ------------------------------------------------------------------
+
+    def get_stats(self, extraction_id: str) -> Dict[str, Any]:
+        """
+        Statistiques d'une extraction.
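+
+        Exemple (esquisse) :
+            >>> stats = store.get_stats(eid)
+            >>> print(stats["record_count"], stats["avg_confidence"])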
+
+        Returns:
+            Dict avec : record_count, avg_confidence, completeness,
+            field_coverage, total_errors, status, created_at, updated_at
+        """
+        extraction = self.get_extraction(extraction_id)
+        if not extraction:
+            return {"error": f"Extraction {extraction_id} introuvable"}
+
+        records = self.get_records(extraction_id)
+
+        if not records:
+            return {
+                "extraction_id": extraction_id,
+                "schema_name": extraction["schema_name"],
+                "status": extraction["status"],
+                "record_count": 0,
+                "avg_confidence": 0.0,
+                "completeness": 0.0,
+                "field_coverage": {},
+            }
+
+        # Confiance moyenne
+        confidences = [r["confidence"] for r in records]
+        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+
+        # Couverture par champ : pourcentage de records ayant une valeur non-nulle
+        schema_data = json.loads(extraction["schema_json"])
+        field_names = [f["name"] for f in schema_data.get("fields", [])]
+
+        field_coverage = {}
+        for fname in field_names:
+            filled = sum(
+                1 for r in records
+                if r["data"].get(fname) is not None
+                and str(r["data"][fname]).strip() != ""
+            )
+            field_coverage[fname] = filled / len(records) if records else 0.0
+
+        # Completude globale
+        completeness = (
+            sum(field_coverage.values()) / len(field_coverage)
+            if field_coverage else 0.0
+        )
+
+        # Erreurs
+        total_errors = sum(len(r.get("errors", [])) for r in records)
+
+        return {
+            "extraction_id": extraction_id,
+            "schema_name": extraction["schema_name"],
+            "status": extraction["status"],
+            "record_count": len(records),
+            "avg_confidence": round(avg_confidence, 3),
+            "completeness": round(completeness, 3),
+            "field_coverage": {k: round(v, 3) for k, v in field_coverage.items()},
+            "total_errors": total_errors,
+            "created_at": extraction["created_at"],
+            "updated_at": extraction["updated_at"],
+        }
+
+    # ------------------------------------------------------------------
+    # Nettoyage
+    # ------------------------------------------------------------------
+
+    def delete_extraction(self, extraction_id: str) -> bool:
+        """Supprimer une extraction et tous ses records."""
+        with self._connect() as conn:
+            conn.execute("DELETE FROM records WHERE extraction_id = ?", (extraction_id,))
+            result = conn.execute("DELETE FROM extractions WHERE id = ?", (extraction_id,))
+            return result.rowcount > 0
+
+    # ------------------------------------------------------------------
+    # Utilitaires internes
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _collect_all_keys(records: List[Dict[str, Any]]) -> List[str]:
+        """Collecter toutes les cles uniques des records, en preservant l'ordre."""
+        seen = set()
+        keys = []
+        for rec in records:
+            for k in rec["data"].keys():
+                if k not in seen:
+                    seen.add(k)
+                    keys.append(k)
+        return keys
diff --git a/core/extraction/extraction_engine.py b/core/extraction/extraction_engine.py
new file mode 100644
index 000000000..f0cd2ab81
--- /dev/null
+++ b/core/extraction/extraction_engine.py
@@ -0,0 +1,312 @@
+"""
+ExtractionEngine - Orchestrateur principal du moteur d'extraction de donnees
+
+Orchestre le cycle complet :
+    naviguer -> screenshot -> extraire -> valider -> stocker -> suivant
+
+S'appuie sur FieldExtractor (VLM/OCR), DataStore (SQLite), et
+IterationController (navigation) pour realiser l'extraction automatisee
+de donnees depuis des interfaces utilisateur.
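+
+Exemple (esquisse — suppose un ExtractionSchema `schema` deja charge et une
+session de streaming active) :
+    >>> engine = ExtractionEngine(schema)
+    >>> eid = engine.start_extraction(
+    ...     session_id="sess-01",
+    ...     on_progress=lambda p: print(p["stats"].get("record_count")),
+    ... )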
+""" + +import logging +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import requests + +from .data_store import DataStore +from .field_extractor import FieldExtractor +from .iteration_controller import IterationController +from .schema import ExtractionSchema + +logger = logging.getLogger(__name__) + + +class ExtractionEngine: + """ + Moteur d'extraction principal. + + Orchestre le cycle : naviguer -> screenshot -> extraire -> stocker -> suivant. + + Modes d'utilisation : + 1. Automatique : start_extraction() — boucle complete avec navigation + 2. Manuel : extract_current_screen() — extraction ponctuelle d'un screenshot + """ + + def __init__( + self, + schema: ExtractionSchema, + store: Optional[DataStore] = None, + field_extractor: Optional[FieldExtractor] = None, + streaming_server_url: str = "http://localhost:5005", + screenshot_dir: str = "data/extractions/screenshots", + ): + """ + Args: + schema: Schema d'extraction decrivant les champs et la navigation + store: DataStore pour le stockage (cree un par defaut si absent) + field_extractor: Extracteur de champs (cree un par defaut si absent) + streaming_server_url: URL du streaming server Agent V1 + screenshot_dir: Repertoire pour sauvegarder les screenshots + """ + self.schema = schema + self.store = store or DataStore() + self.field_extractor = field_extractor or FieldExtractor() + self.controller = IterationController(schema, streaming_server_url) + self.streaming_server_url = streaming_server_url.rstrip("/") + self.screenshot_dir = Path(screenshot_dir) + self.screenshot_dir.mkdir(parents=True, exist_ok=True) + + # Etat interne + self._current_extraction_id: Optional[str] = None + self._is_running = False + self._should_stop = False + self._progress_callback: Optional[Callable] = None + + # ------------------------------------------------------------------ + # API publique - Extraction automatique + # ------------------------------------------------------------------ + + def start_extraction( + self, + session_id: str, + on_progress: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> str: + """ + Demarrer une session d'extraction automatique. + + Boucle : + 1. Creer l'extraction dans le store + 2. Pour chaque enregistrement : + a. Prendre un screenshot + b. Extraire les champs + c. Valider + d. Stocker + e. Naviguer au suivant + 3. Finaliser et retourner l'extraction_id + + Args: + session_id: ID de la session de streaming (pour navigation) + on_progress: Callback appele a chaque record (optionnel) + + Returns: + extraction_id + """ + self._is_running = True + self._should_stop = False + self._progress_callback = on_progress + + # Creer la session d'extraction + extraction_id = self.store.create_extraction(self.schema) + self._current_extraction_id = extraction_id + + logger.info( + "Demarrage extraction %s (schema=%s, max=%d)", + extraction_id[:8], + self.schema.name, + self.controller.max_records, + ) + + try: + while self.controller.has_next() and not self._should_stop: + idx = self.controller.current_index + + # 1. Screenshot + screenshot_path = self._take_screenshot(session_id, idx) + if screenshot_path is None: + logger.warning("Screenshot echoue a l'index %d, on continue", idx) + # Naviguer quand meme pour ne pas rester bloque + self.controller.navigate_to_next(session_id) + continue + + # 2. Extraction + result = self.extract_current_screen(screenshot_path) + + # 3. 
Stockage + self.store.add_record( + extraction_id=extraction_id, + data=result["data"], + screenshot_path=screenshot_path, + confidence=result["confidence"], + errors=result.get("errors"), + ) + + # 4. Callback de progression + if self._progress_callback: + progress = self.get_progress() + progress["last_record"] = result["data"] + progress["last_confidence"] = result["confidence"] + self._progress_callback(progress) + + logger.info( + "Record %d/%d extrait (confiance=%.2f)", + idx + 1, + self.controller.max_records, + result["confidence"], + ) + + # 5. Navigation + if not self.controller.navigate_to_next(session_id): + logger.info("Fin de navigation a l'index %d", idx) + break + + # Finaliser + status = "stopped" if self._should_stop else "completed" + self.store.finish_extraction(extraction_id, status=status) + + logger.info( + "Extraction %s terminee : %s (%d records)", + extraction_id[:8], + status, + self.controller.current_index, + ) + + except Exception as e: + logger.error("Erreur pendant l'extraction : %s", e) + self.store.finish_extraction(extraction_id, status="error") + raise + + finally: + self._is_running = False + self._current_extraction_id = None + + return extraction_id + + def stop_extraction(self) -> None: + """Demander l'arret de l'extraction en cours.""" + if self._is_running: + logger.info("Arret demande pour l'extraction en cours") + self._should_stop = True + + # ------------------------------------------------------------------ + # API publique - Extraction ponctuelle + # ------------------------------------------------------------------ + + def extract_current_screen(self, screenshot_path: str) -> Dict[str, Any]: + """ + Extraire les champs du screenshot actuel sans navigation. + + Args: + screenshot_path: Chemin vers le screenshot + + Returns: + Dict avec 'data', 'confidence', 'errors', 'validation' + """ + # Extraction + result = self.field_extractor.extract_fields(screenshot_path, self.schema) + + # Validation contre le schema + validation = self.schema.validate_record(result["data"]) + result["validation"] = validation + + return result + + # ------------------------------------------------------------------ + # API publique - Progression + # ------------------------------------------------------------------ + + def get_progress(self) -> Dict[str, Any]: + """Retourne la progression actuelle de l'extraction.""" + nav_progress = self.controller.progress + stats = {} + + if self._current_extraction_id: + stats = self.store.get_stats(self._current_extraction_id) + + return { + "extraction_id": self._current_extraction_id, + "is_running": self._is_running, + "navigation": nav_progress, + "stats": stats, + "schema_name": self.schema.name, + } + + # ------------------------------------------------------------------ + # Screenshot + # ------------------------------------------------------------------ + + def _take_screenshot(self, session_id: str, index: int) -> Optional[str]: + """ + Prendre un screenshot via le streaming server. + + Essaie d'appeler l'API du streaming server pour obtenir + le screenshot courant. En cas d'echec, retourne None. 
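+
+        L'appel effectue un GET sur {streaming_server_url}/api/screenshot
+        avec le parametre session_id et ecrit la reponse binaire (PNG)
+        dans screenshot_dir.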
+ + Args: + session_id: ID de la session de streaming + index: Index de l'enregistrement courant + + Returns: + Chemin du screenshot sauvegarde, ou None + """ + try: + response = requests.get( + f"{self.streaming_server_url}/api/screenshot", + params={"session_id": session_id}, + timeout=10, + ) + + if response.status_code == 200: + # Sauvegarder le screenshot + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + filename = f"record_{index:04d}_{timestamp}.png" + filepath = self.screenshot_dir / filename + + with open(filepath, "wb") as f: + f.write(response.content) + + return str(filepath) + else: + logger.warning( + "Screenshot echoue : HTTP %d", response.status_code + ) + return None + + except requests.exceptions.ConnectionError: + logger.warning( + "Streaming server non accessible pour screenshot" + ) + return None + + except Exception as e: + logger.error("Erreur screenshot : %s", e) + return None + + # ------------------------------------------------------------------ + # Utilitaires + # ------------------------------------------------------------------ + + def extract_from_file(self, screenshot_path: str) -> Dict[str, Any]: + """ + Raccourci pour extraire depuis un fichier existant + et stocker le resultat. + + Utile pour du retraitement offline de screenshots. + + Args: + screenshot_path: Chemin vers un screenshot existant + + Returns: + Dict avec les donnees extraites et le record_id + """ + if self._current_extraction_id is None: + extraction_id = self.store.create_extraction(self.schema) + else: + extraction_id = self._current_extraction_id + + result = self.extract_current_screen(screenshot_path) + + record_id = self.store.add_record( + extraction_id=extraction_id, + data=result["data"], + screenshot_path=screenshot_path, + confidence=result["confidence"], + errors=result.get("errors"), + ) + + result["record_id"] = record_id + result["extraction_id"] = extraction_id + return result diff --git a/core/extraction/field_extractor.py b/core/extraction/field_extractor.py new file mode 100644 index 000000000..a627334c2 --- /dev/null +++ b/core/extraction/field_extractor.py @@ -0,0 +1,327 @@ +""" +FieldExtractor - Extraction de champs structures depuis des screenshots + +Utilise un VLM (Ollama) pour comprendre le contenu visuel et en extraire +des donnees structurees selon un schema predefini. +Fallback OCR via docTR si le VLM echoue. +""" + +import base64 +import json +import logging +import os +import re +from pathlib import Path +from typing import Any, Dict, List, Optional + +import requests + +from .schema import ExtractionField, ExtractionSchema + +logger = logging.getLogger(__name__) + +# Configuration Ollama (coherente avec le reste du projet) +OLLAMA_DEFAULT_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") +OLLAMA_DEFAULT_MODEL = os.environ.get("VLM_MODEL", "qwen3-vl:8b") + + +class FieldExtractor: + """ + Extrait des champs structures depuis un screenshot. + + Pipeline : + 1. VLM : envoyer screenshot + schema au VLM pour extraction structuree + 2. Validation : verifier les regex, types, champs requis + 3. 
(Optionnel) OCR fallback si VLM indisponible + """ + + def __init__( + self, + ollama_url: str = OLLAMA_DEFAULT_URL, + ollama_model: str = OLLAMA_DEFAULT_MODEL, + timeout: int = 60, + ): + """ + Args: + ollama_url: URL du serveur Ollama + ollama_model: Modele VLM a utiliser + timeout: Timeout en secondes pour les appels VLM + """ + self.ollama_url = ollama_url.rstrip("/") + self.ollama_model = ollama_model + self.timeout = timeout + + # ------------------------------------------------------------------ + # API publique + # ------------------------------------------------------------------ + + def extract_fields( + self, + screenshot_path: str, + schema: ExtractionSchema, + ) -> Dict[str, Any]: + """ + Extraire les champs definis par le schema depuis un screenshot. + + Args: + screenshot_path: Chemin vers l'image (PNG/JPEG) + schema: Schema d'extraction + + Returns: + Dict avec les champs extraits + metadonnees + { + "data": {"nom": "DUPONT", "prenom": "Jean", ...}, + "confidence": 0.85, + "errors": [], + "raw_response": "..." + } + """ + path = Path(screenshot_path) + if not path.exists(): + return { + "data": {}, + "confidence": 0.0, + "errors": [f"Fichier introuvable : {screenshot_path}"], + "raw_response": None, + } + + # Encoder l'image en base64 + image_b64 = self._encode_image(path) + + # Extraction via VLM + raw_data, raw_response = self._extract_via_vlm(image_b64, schema.fields) + + if raw_data is None: + logger.warning("VLM extraction echouee, tentative OCR fallback") + raw_data = self._extract_via_ocr_fallback(path, schema.fields) + raw_response = "(ocr fallback)" + + # Validation et nettoyage + validated = {} + errors: List[str] = [] + valid_count = 0 + + for fld in schema.fields: + value = raw_data.get(fld.name) if raw_data else None + # Nettoyer + if value is not None: + value = str(value).strip() + if value == "" or value.lower() in ("null", "none", "n/a"): + value = None + + validated[fld.name] = value + + if not fld.validate_value(value): + errors.append( + f"Champ '{fld.name}' invalide ou manquant : {value!r}" + ) + else: + if value is not None and str(value).strip(): + valid_count += 1 + + total = len(schema.fields) if schema.fields else 1 + confidence = valid_count / total + + return { + "data": validated, + "confidence": confidence, + "errors": errors, + "raw_response": raw_response, + } + + # ------------------------------------------------------------------ + # Extraction VLM + # ------------------------------------------------------------------ + + def _extract_via_vlm( + self, image_b64: str, fields: List[ExtractionField] + ) -> tuple: + """ + Appeler le VLM (Ollama) pour extraction structuree. 
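+
+        L'appel passe par /api/generate avec format="json" et une
+        temperature basse ; pour les modeles Qwen, le prefixe /nothink
+        desactive le mode "thinking".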
+ + Returns: + (dict_donnees | None, raw_response_text | None) + """ + prompt = self._build_extraction_prompt(fields) + + try: + # Desactiver le mode thinking pour Qwen3 + effective_prompt = prompt + if "qwen" in self.ollama_model.lower(): + effective_prompt = f"/nothink {prompt}" + + payload = { + "model": self.ollama_model, + "prompt": effective_prompt, + "images": [image_b64], + "stream": False, + "format": "json", + "options": { + "temperature": 0.1, + "num_predict": 2000, + }, + } + + response = requests.post( + f"{self.ollama_url}/api/generate", + json=payload, + timeout=self.timeout, + ) + + if response.status_code != 200: + logger.error( + "Erreur Ollama %d : %s", + response.status_code, + response.text[:300], + ) + return None, None + + result = response.json() + raw_text = result.get("response", "").strip() + logger.debug("Reponse VLM brute : %s", raw_text[:500]) + + parsed = self._parse_vlm_response(raw_text) + return parsed, raw_text + + except requests.exceptions.Timeout: + logger.error("Timeout VLM apres %ds", self.timeout) + return None, None + + except requests.exceptions.ConnectionError: + logger.error("Ollama non accessible a %s", self.ollama_url) + return None, None + + except Exception as e: + logger.error("Erreur VLM inattendue : %s", e) + return None, None + + def _build_extraction_prompt(self, fields: List[ExtractionField]) -> str: + """Construire le prompt d'extraction structure pour le VLM.""" + field_descriptions = [] + for f in fields: + desc = f"- {f.name} ({f.field_type}): {f.description}" + if f.required: + desc += " [OBLIGATOIRE]" + if f.validation_regex: + desc += f" (format: {f.validation_regex})" + field_descriptions.append(desc) + + fields_text = "\n".join(field_descriptions) + + return f"""Regarde cette capture d'ecran et extrais les informations suivantes. + +CHAMPS A EXTRAIRE : +{fields_text} + +INSTRUCTIONS : +1. Extrais chaque champ tel qu'il apparait a l'ecran +2. Si un champ n'est pas visible, mets null +3. Pour les dates, conserve le format tel qu'affiche +4. Pour les nombres, conserve le format avec virgule si present +5. Reponds UNIQUEMENT en JSON valide + +FORMAT DE REPONSE : +Un objet JSON avec les cles correspondant aux noms de champs ci-dessus. +Exemple : {{"nom": "DUPONT", "prenom": "Jean", "date_naissance": "15/03/1965"}} + +Extrais maintenant les donnees :""" + + def _parse_vlm_response(self, text: str) -> Optional[Dict[str, Any]]: + """Parser la reponse JSON du VLM.""" + if not text: + return None + + # Essayer le parse direct + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + # Chercher un objet JSON dans la reponse + match = re.search(r"\{[\s\S]*\}", text) + if match: + try: + return json.loads(match.group()) + except json.JSONDecodeError: + pass + + # Chercher entre balises ```json ... ``` + match = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", text) + if match: + try: + return json.loads(match.group(1)) + except json.JSONDecodeError: + pass + + logger.warning("Impossible de parser la reponse VLM en JSON") + return None + + # ------------------------------------------------------------------ + # OCR Fallback + # ------------------------------------------------------------------ + + def _extract_via_ocr_fallback( + self, image_path: Path, fields: List[ExtractionField] + ) -> Optional[Dict[str, Any]]: + """ + Fallback : extraire du texte brut via OCR (docTR) puis tenter + un mapping basique vers les champs. + + Ce fallback est tres basique ; il fournit le texte brut + sans mapping intelligent. 
Le VLM reste la methode privilegiee.
+        """
+        try:
+            # Tenter docTR
+            try:
+                from doctr.io import DocumentFile
+                from doctr.models import ocr_predictor
+
+                predictor = ocr_predictor(
+                    det_arch="db_mobilenet_v3_large",
+                    reco_arch="crnn_mobilenet_v3_large",
+                    pretrained=True,
+                )
+                doc = DocumentFile.from_images([str(image_path)])
+                result = predictor(doc)
+
+                # Extraire tout le texte
+                all_text = []
+                for page in result.pages:
+                    for block in page.blocks:
+                        for line in block.lines:
+                            line_text = " ".join(w.value for w in line.words)
+                            all_text.append(line_text)
+
+                full_text = "\n".join(all_text)
+                logger.info("OCR fallback : %d lignes extraites", len(all_text))
+
+                # Retourner le texte complet dans un champ special
+                return {"_ocr_text": full_text}
+
+            except ImportError:
+                logger.warning("docTR non disponible pour le fallback OCR")
+                return None
+
+        except Exception as e:
+            logger.error("Erreur OCR fallback : %s", e)
+            return None
+
+    # ------------------------------------------------------------------
+    # Utilitaires
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _encode_image(path: Path) -> str:
+        """Encoder une image en base64."""
+        with open(path, "rb") as f:
+            return base64.b64encode(f.read()).decode("utf-8")
+
+    def check_vlm_available(self) -> bool:
+        """Verifier si le VLM Ollama est accessible."""
+        try:
+            response = requests.get(
+                f"{self.ollama_url}/api/tags", timeout=5
+            )
+            return response.status_code == 200
+        except (requests.RequestException, ConnectionError, TimeoutError):
+            return False
diff --git a/core/extraction/iteration_controller.py b/core/extraction/iteration_controller.py
new file mode 100644
index 000000000..05bd7138b
--- /dev/null
+++ b/core/extraction/iteration_controller.py
@@ -0,0 +1,258 @@
+"""
+IterationController - Controle de navigation entre enregistrements
+
+Gere la boucle de navigation : passage au record suivant, pagination,
+scroll, etc. Communique avec le streaming server (Agent V1) pour
+envoyer les actions de navigation sur la machine cible.
+"""
+
+import logging
+import time
+from typing import Any, Dict, Optional
+
+import requests
+
+from .schema import ExtractionSchema
+
+logger = logging.getLogger(__name__)
+
+
+class IterationController:
+    """
+    Controle la navigation entre les enregistrements a extraire.
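+
+    Le bloc `navigation` du schema pilote le comportement ; exemple
+    (esquisse YAML) :
+        navigation:
+          type: pagination
+          next_record: click_next_in_list
+          max_records: 50
+          delay_ms: 1000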
+ + Types de navigation supportes : + - list_detail : cliquer sur chaque element d'une liste + - pagination : bouton suivant / page suivante + - scroll : defilement vertical + - manual : l'utilisateur navigue manuellement + """ + + def __init__( + self, + schema: ExtractionSchema, + streaming_server_url: str = "http://localhost:5005", + ): + """ + Args: + schema: Schema d'extraction (contient les regles de navigation) + streaming_server_url: URL du streaming server Agent V1 + """ + self.schema = schema + self.server_url = streaming_server_url.rstrip("/") + self.current_index = 0 + self.max_records = schema.navigation.get("max_records", 100) + self.nav_type = schema.navigation.get("type", "manual") + self.nav_action = schema.navigation.get("next_record", "click_next_in_list") + self.nav_delay = schema.navigation.get("delay_ms", 1000) + + # Etat interne + self._started = False + self._finished = False + + # ------------------------------------------------------------------ + # API publique + # ------------------------------------------------------------------ + + def has_next(self) -> bool: + """Retourne True s'il reste des enregistrements a traiter.""" + if self._finished: + return False + return self.current_index < self.max_records + + def navigate_to_next(self, session_id: str) -> bool: + """ + Naviguer vers l'enregistrement suivant. + + Envoie les actions de navigation au streaming server + en fonction du type de navigation defini dans le schema. + + Args: + session_id: ID de la session de streaming + + Returns: + True si la navigation a reussi + """ + if not self.has_next(): + logger.info("Plus d'enregistrements a traiter (index=%d)", self.current_index) + return False + + success = False + + if self.nav_type == "manual": + # Mode manuel : on attend juste un delai + logger.info( + "Navigation manuelle : attente de %dms (index=%d)", + self.nav_delay, + self.current_index, + ) + time.sleep(self.nav_delay / 1000) + success = True + + elif self.nav_type == "pagination": + success = self._navigate_pagination(session_id) + + elif self.nav_type == "list_detail": + success = self._navigate_list_detail(session_id) + + elif self.nav_type == "scroll": + success = self._navigate_scroll(session_id) + + else: + logger.warning("Type de navigation inconnu : %s", self.nav_type) + success = False + + if success: + self.current_index += 1 + logger.debug( + "Navigation reussie -> index=%d/%d", + self.current_index, + self.max_records, + ) + + return success + + def navigate_to_record(self, session_id: str, index: int) -> bool: + """ + Naviguer vers un enregistrement specifique. 
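+
+        Exemple (esquisse) :
+            >>> controller.navigate_to_record(session_id, 5)  # avance pas a pas jusqu'a l'index 5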
+ + Args: + session_id: ID de la session de streaming + index: Index de l'enregistrement cible + + Returns: + True si la navigation a reussi + """ + if index < 0 or index >= self.max_records: + logger.error("Index hors limites : %d (max=%d)", index, self.max_records) + return False + + # Naviguer pas a pas jusqu'a l'index cible + steps = index - self.current_index + if steps < 0: + logger.warning( + "Navigation arriere non supportee (current=%d, target=%d)", + self.current_index, + index, + ) + return False + + for _ in range(steps): + if not self.navigate_to_next(session_id): + return False + + return True + + def reset(self) -> None: + """Reinitialiser le controleur.""" + self.current_index = 0 + self._started = False + self._finished = False + + def mark_finished(self) -> None: + """Marquer l'iteration comme terminee (ex: fin de liste detectee).""" + self._finished = True + logger.info("Iteration marquee comme terminee a l'index %d", self.current_index) + + @property + def progress(self) -> Dict[str, Any]: + """Retourne la progression actuelle.""" + return { + "current_index": self.current_index, + "max_records": self.max_records, + "progress_pct": round( + (self.current_index / self.max_records * 100) + if self.max_records > 0 else 0, + 1, + ), + "nav_type": self.nav_type, + "finished": self._finished, + } + + # ------------------------------------------------------------------ + # Navigation specifique + # ------------------------------------------------------------------ + + def _navigate_pagination(self, session_id: str) -> bool: + """Navigation par pagination (bouton suivant).""" + action = { + "type": "click", + "target": self.nav_action, + "description": "Cliquer sur le bouton suivant / page suivante", + } + return self._send_action(session_id, action) + + def _navigate_list_detail(self, session_id: str) -> bool: + """Navigation dans une liste (cliquer sur l'element suivant).""" + action = { + "type": "click", + "target": self.nav_action, + "index": self.current_index, + "description": f"Cliquer sur l'element {self.current_index + 1} de la liste", + } + return self._send_action(session_id, action) + + def _navigate_scroll(self, session_id: str) -> bool: + """Navigation par defilement.""" + action = { + "type": "scroll", + "direction": "down", + "amount": self.schema.navigation.get("scroll_amount", 300), + "description": "Defiler vers le bas", + } + return self._send_action(session_id, action) + + # ------------------------------------------------------------------ + # Communication avec le streaming server + # ------------------------------------------------------------------ + + def _send_action(self, session_id: str, action: Dict[str, Any]) -> bool: + """ + Envoyer une action de navigation au streaming server. + + L'action est envoyee via l'API du streaming server (port 5005). + Si le serveur n'est pas disponible, on simule un delai. 
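+
+        Exemple de payload envoye (esquisse ; l'ID de session est fictif) :
+            {"session_id": "sess-001",
+             "action": {"type": "scroll", "direction": "down", "amount": 300}}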
+ + Args: + session_id: ID de la session de streaming + action: Description de l'action a executer + + Returns: + True si l'action a ete executee ou simulee + """ + try: + payload = { + "session_id": session_id, + "action": action, + } + + response = requests.post( + f"{self.server_url}/api/action", + json=payload, + timeout=10, + ) + + if response.status_code == 200: + # Attendre le delai de navigation + if self.nav_delay > 0: + time.sleep(self.nav_delay / 1000) + return True + else: + logger.warning( + "Action de navigation echouee : HTTP %d", response.status_code + ) + return False + + except requests.exceptions.ConnectionError: + logger.warning( + "Streaming server non accessible a %s — simulation du delai", + self.server_url, + ) + # Simuler l'attente de navigation (mode degrade) + if self.nav_delay > 0: + time.sleep(self.nav_delay / 1000) + return True + + except Exception as e: + logger.error("Erreur envoi action de navigation : %s", e) + return False diff --git a/core/extraction/schema.py b/core/extraction/schema.py new file mode 100644 index 000000000..1880d7521 --- /dev/null +++ b/core/extraction/schema.py @@ -0,0 +1,217 @@ +""" +Schema d'extraction de donnees - Definition des champs et navigation + +Permet de definir un schema YAML decrivant les champs a extraire +depuis des captures d'ecran (DPI, formulaires, listes...). +""" + +import re +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + + +@dataclass +class ExtractionField: + """Definition d'un champ a extraire depuis un screenshot.""" + + name: str # Ex: "nom_patient", "date_naissance" + description: str # Description pour le VLM + field_type: str = "text" # "text", "date", "number", "boolean" + required: bool = True + validation_regex: Optional[str] = None # Regex de validation optionnelle + + def validate_value(self, value: Optional[str]) -> bool: + """ + Valider une valeur extraite pour ce champ. + + Returns: + True si la valeur est valide + """ + # Champ requis mais absent + if self.required and (value is None or str(value).strip() == ""): + return False + + # Pas de valeur et pas requis => OK + if value is None or str(value).strip() == "": + return True + + value_str = str(value).strip() + + # Validation par type + if self.field_type == "number": + try: + float(value_str.replace(",", ".").replace(" ", "")) + except ValueError: + return False + + elif self.field_type == "boolean": + if value_str.lower() not in ( + "true", "false", "oui", "non", "1", "0", "vrai", "faux" + ): + return False + + elif self.field_type == "date": + # Accepter les formats courants FR + date_patterns = [ + r"\d{2}/\d{2}/\d{4}", # JJ/MM/AAAA + r"\d{2}-\d{2}-\d{4}", # JJ-MM-AAAA + r"\d{4}-\d{2}-\d{2}", # AAAA-MM-JJ (ISO) + r"\d{2}\.\d{2}\.\d{4}", # JJ.MM.AAAA + ] + if not any(re.fullmatch(p, value_str) for p in date_patterns): + return False + + # Validation regex custom + if self.validation_regex: + if not re.fullmatch(self.validation_regex, value_str): + return False + + return True + + +@dataclass +class ExtractionSchema: + """ + Schema complet d'extraction : liste de champs + regles de navigation. + + Peut etre charge/sauvegarde en YAML pour reutilisation. 
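+
+    Exemple de YAML attendu (esquisse ; les champs sont hypothetiques) :
+        name: dossier_patient_DPI
+        description: Extraction des dossiers patients
+        fields:
+          - name: nom
+            type: text
+            required: true
+          - name: date_naissance
+            type: date
+        navigation:
+          type: pagination
+          max_records: 50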
+ """ + + name: str # Ex: "dossier_patient_DPI" + description: str + fields: List[ExtractionField] = field(default_factory=list) + navigation: Dict[str, Any] = field(default_factory=dict) + + # --- Serialisation YAML --- + + @classmethod + def from_yaml(cls, path: str) -> "ExtractionSchema": + """ + Charger un schema depuis un fichier YAML. + + Args: + path: Chemin vers le fichier YAML + + Returns: + Instance ExtractionSchema + """ + yaml_path = Path(path) + if not yaml_path.exists(): + raise FileNotFoundError(f"Schema YAML non trouve : {path}") + + with open(yaml_path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + if not isinstance(data, dict): + raise ValueError(f"Le fichier YAML doit contenir un dictionnaire, pas {type(data).__name__}") + + return cls._from_dict(data) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema": + """Construire un schema depuis un dictionnaire Python.""" + return cls._from_dict(data) + + @classmethod + def _from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema": + """Construction interne depuis un dict.""" + fields_raw = data.get("fields", []) + fields = [] + for fd in fields_raw: + fields.append(ExtractionField( + name=fd["name"], + description=fd.get("description", ""), + field_type=fd.get("type", fd.get("field_type", "text")), + required=fd.get("required", True), + validation_regex=fd.get("validation", fd.get("validation_regex")), + )) + + return cls( + name=data.get("name", "unnamed"), + description=data.get("description", ""), + fields=fields, + navigation=data.get("navigation", {}), + ) + + def to_yaml(self, path: str) -> None: + """ + Sauvegarder le schema en fichier YAML. + + Args: + path: Chemin de sortie + """ + yaml_path = Path(path) + yaml_path.parent.mkdir(parents=True, exist_ok=True) + + data = self.to_dict() + + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False) + + def to_dict(self) -> Dict[str, Any]: + """Convertir en dictionnaire serialisable.""" + return { + "name": self.name, + "description": self.description, + "fields": [ + { + "name": f.name, + "description": f.description, + "type": f.field_type, + "required": f.required, + **({"validation": f.validation_regex} if f.validation_regex else {}), + } + for f in self.fields + ], + "navigation": self.navigation, + } + + # --- Utilitaires --- + + @property + def required_fields(self) -> List[ExtractionField]: + """Retourne la liste des champs obligatoires.""" + return [f for f in self.fields if f.required] + + @property + def field_names(self) -> List[str]: + """Retourne la liste des noms de champs.""" + return [f.name for f in self.fields] + + def get_field(self, name: str) -> Optional[ExtractionField]: + """Recuperer un champ par son nom.""" + for f in self.fields: + if f.name == name: + return f + return None + + def validate_record(self, record: Dict[str, Any]) -> Dict[str, Any]: + """ + Valider un enregistrement complet contre le schema. 
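+
+        Exemple (esquisse, pour un schema reduit a deux champs texte requis) :
+            >>> schema.validate_record({"nom": "DUPONT", "prenom": "Jean"})
+            {'valid': True, 'errors': [], 'completeness': 1.0}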
+ + Returns: + Dict avec 'valid' (bool), 'errors' (list), 'completeness' (float) + """ + errors = [] + valid_count = 0 + + for fld in self.fields: + value = record.get(fld.name) + if fld.validate_value(value): + if value is not None and str(value).strip(): + valid_count += 1 + else: + errors.append(f"Champ '{fld.name}' invalide: {value!r}") + + total = len(self.fields) if self.fields else 1 + completeness = valid_count / total + + return { + "valid": len(errors) == 0, + "errors": errors, + "completeness": completeness, + } diff --git a/core/graph/graph_builder.py b/core/graph/graph_builder.py index 0146a1d78..b36b60c6d 100644 --- a/core/graph/graph_builder.py +++ b/core/graph/graph_builder.py @@ -24,8 +24,9 @@ Example: """ import logging -from typing import List, Dict, Optional, Tuple -from collections import defaultdict +import os +from typing import List, Dict, Optional, Tuple, Any +from collections import defaultdict, Counter from datetime import datetime from pathlib import Path @@ -106,6 +107,7 @@ class GraphBuilder: self.clustering_eps = clustering_eps self.clustering_min_samples = clustering_min_samples self.enable_quality_validation = enable_quality_validation + self._screen_analyzer = None # ScreenAnalyzer (lazy import) logger.info( f"GraphBuilder initialized: " @@ -119,39 +121,47 @@ class GraphBuilder: self, session: RawSession, workflow_name: Optional[str] = None, + precomputed_states: Optional[List["ScreenState"]] = None, ) -> Workflow: """ Construire un Workflow complet depuis une RawSession. - + Processus: - 1. Créer ScreenStates depuis screenshots + 1. Créer ScreenStates depuis screenshots (ou utiliser precomputed_states) 2. Calculer embeddings pour chaque état 3. Détecter patterns via clustering 4. Construire nodes depuis clusters 5. Construire edges depuis transitions - + Args: session: Session brute à analyser workflow_name: Nom du workflow (généré si None) - + precomputed_states: ScreenStates déjà analysés (streaming). + Si fourni, saute l'étape 1 (pas de re-analyse via ScreenAnalyzer). 
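+
+        Exemple (esquisse ; `builder` et `states` sont hypothétiques) :
+            >>> wf = builder.build_workflow(session, precomputed_states=states)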
+ Returns: Workflow construit avec nodes et edges - + Raises: ValueError: Si la session est vide ou invalide """ - if not session.screenshots: - raise ValueError("Session has no screenshots") - + if not precomputed_states and not session.screenshots: + raise ValueError("Session has no screenshots and no precomputed states") + logger.info( f"Building workflow from session {session.session_id} " - f"with {len(session.screenshots)} screenshots" + f"with {len(precomputed_states or session.screenshots)} " + f"{'precomputed states' if precomputed_states else 'screenshots'}" ) - - # Étape 1: Créer ScreenStates - screen_states = self._create_screen_states(session) - logger.debug(f"Created {len(screen_states)} screen states") - + + # Étape 1: Créer ScreenStates (ou réutiliser ceux pré-calculés) + if precomputed_states: + screen_states = precomputed_states + logger.debug(f"Using {len(screen_states)} precomputed screen states") + else: + screen_states = self._create_screen_states(session) + logger.debug(f"Created {len(screen_states)} screen states") + # Étape 2: Calculer embeddings embeddings = self._compute_embeddings(screen_states) logger.debug(f"Computed {len(embeddings)} embeddings") @@ -315,16 +325,31 @@ class GraphBuilder: file_size_bytes=screenshot_path.stat().st_size if screenshot_path.exists() else 0 ) - # Créer PerceptionLevel (sera enrichi par embedding_builder) + # Créer PerceptionLevel — enrichir avec OCR si le screenshot existe + detected_text = [] + text_method = "none" + + if screenshot_path.exists(): + try: + if self._screen_analyzer is None: + from core.pipeline.screen_analyzer import ScreenAnalyzer + self._screen_analyzer = ScreenAnalyzer(session_id=session.session_id) + extracted = self._screen_analyzer._extract_text(str(screenshot_path)) + if extracted: + detected_text = extracted + text_method = self._screen_analyzer._get_ocr_method_name() + except Exception as e: + logger.debug(f"OCR échoué pour {screenshot_path}: {e}") + perception = PerceptionLevel( embedding=EmbeddingRef( provider="openclip_ViT-B-32", vector_id=f"data/embeddings/screens/{session.session_id}_state_{i:04d}.npy", dimensions=512 ), - detected_text=[], # Sera rempli par VLM/OCR - text_detection_method="pending", - confidence_avg=0.0 + detected_text=detected_text, + text_detection_method=text_method, + confidence_avg=0.85 if detected_text else 0.0 ) # Créer ContextLevel @@ -504,8 +529,12 @@ class GraphBuilder: node = WorkflowNode( node_id=f"node_{cluster_id:03d}", name=f"State Pattern {cluster_id}", - screen_template=template, - observation_count=len(indices), + description=f"Pattern auto-détecté ({len(indices)} observations)", + template=template, + metadata={ + "observation_count": len(indices), + "_prototype_vector": prototype.tolist(), + }, ) nodes.append(node) @@ -522,27 +551,172 @@ class GraphBuilder: ) -> ScreenTemplate: """ Créer un ScreenTemplate depuis un cluster d'états. 
- - TODO: Implémenter extraction intelligente de: - - window_title_pattern (regex depuis titres communs) - - required_text_patterns (texte présent dans tous les états) - - required_ui_elements (éléments UI communs) - + + Extrait les contraintes communes à tous les états du cluster : + - window_title_pattern : titre de fenêtre commun + - required_text_patterns : textes présents dans la majorité des états + - required_ui_elements : rôles/types UI récurrents + Args: states: États du cluster prototype_embedding: Embedding prototype - + Returns: - ScreenTemplate avec contraintes + ScreenTemplate avec contraintes extraites """ - # Pour l'instant, template basique avec seulement l'embedding - return ScreenTemplate( - embedding_prototype=prototype_embedding.tolist(), - similarity_threshold=0.85, - window_title_pattern=None, # TODO: Extraire - required_text_patterns=[], # TODO: Extraire - required_ui_elements=[], # TODO: Extraire + # --- Extraction du titre de fenêtre commun --- + window_title_pattern = self._extract_window_pattern(states) + + # --- Extraction des textes récurrents --- + required_text_patterns = self._extract_common_texts(states) + + # --- Extraction des éléments UI récurrents --- + required_ui_elements = self._extract_common_ui_elements(states) + + # Construire les sous-objets de contraintes + window_constraint = WindowConstraint( + title_pattern=window_title_pattern, + title_contains=window_title_pattern, ) + + text_constraint = TextConstraint( + required_texts=required_text_patterns, + ) + + ui_roles = [ + e.get("role", "") for e in required_ui_elements if e.get("role") + ] + ui_constraint = UIConstraint( + required_roles=ui_roles, + ) + + embedding_proto = EmbeddingPrototype( + provider="openclip_ViT-B-32", + vector_id="", # Le vecteur est stocké dans node.metadata._prototype_vector + min_cosine_similarity=0.85, + sample_count=len(states), + ) + + return ScreenTemplate( + window=window_constraint, + text=text_constraint, + ui=ui_constraint, + embedding=embedding_proto, + ) + + def _extract_window_pattern(self, states: List[ScreenState]) -> Optional[str]: + """Extraire un pattern de titre de fenêtre commun aux états du cluster.""" + titles = [s.window.window_title for s in states if s.window.window_title] + if not titles: + return None + + # Si tous les titres sont identiques, retourner directement + if len(set(titles)) == 1: + return titles[0] + + # Trouver le préfixe commun le plus long + prefix = os.path.commonprefix(titles) + if len(prefix) >= 5: + return prefix.rstrip(" -–—|") + + # Fallback: le titre le plus fréquent + from collections import Counter + most_common = Counter(titles).most_common(1)[0][0] + return most_common + + def _extract_common_texts( + self, states: List[ScreenState], min_presence_ratio: float = 0.6 + ) -> List[str]: + """ + Extraire les textes présents dans la majorité des états du cluster. 
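+
+        Exemple chiffré : sur 5 états contenant du texte, un libellé présent
+        dans 4 d'entre eux passe le seuil max(2, int(5 * 0.6)) = 3 et est retenu.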
+ + Args: + states: États du cluster + min_presence_ratio: Proportion minimale de présence (0.6 = 60% des états) + """ + if not states: + return [] + + # Collecter les textes de chaque état + text_counts: Dict[str, int] = defaultdict(int) + states_with_text = 0 + + for state in states: + if hasattr(state.perception, 'detected_text') and state.perception.detected_text: + states_with_text += 1 + seen_in_state = set() + for text in state.perception.detected_text: + normalized = text.strip().lower() + if len(normalized) >= 3 and normalized not in seen_in_state: + text_counts[normalized] += 1 + seen_in_state.add(normalized) + + if states_with_text == 0: + return [] + + # Garder les textes présents dans au moins min_presence_ratio des états + threshold = max(2, int(states_with_text * min_presence_ratio)) + common_texts = [ + text for text, count in text_counts.items() + if count >= threshold + ] + + # Limiter à 10 textes les plus fréquents + common_texts.sort(key=lambda t: text_counts[t], reverse=True) + return common_texts[:10] + + def _extract_common_ui_elements( + self, states: List[ScreenState], min_presence_ratio: float = 0.5 + ) -> List[Dict[str, Any]]: + """ + Extraire les types/rôles d'éléments UI récurrents dans le cluster. + + Retourne une liste de contraintes UI au format: + [{"type": "button", "role": "validate", "min_count": 1}, ...] + """ + if not states: + return [] + + # Compter les paires (type, role) dans chaque état + role_counts: Dict[str, int] = defaultdict(int) + type_counts: Dict[str, int] = defaultdict(int) + states_with_ui = 0 + + for state in states: + if state.ui_elements: + states_with_ui += 1 + seen_roles = set() + seen_types = set() + for el in state.ui_elements: + el_type = getattr(el, 'type', 'unknown') + el_role = getattr(el, 'role', 'unknown') + + if el_role != 'unknown' and el_role not in seen_roles: + role_counts[el_role] += 1 + seen_roles.add(el_role) + + if el_type != 'unknown' and el_type not in seen_types: + type_counts[el_type] += 1 + seen_types.add(el_type) + + if states_with_ui == 0: + return [] + + threshold = max(2, int(states_with_ui * min_presence_ratio)) + + constraints = [] + + # Ajouter les rôles récurrents + for role, count in role_counts.items(): + if count >= threshold: + constraints.append({ + "role": role, + "min_count": 1, + }) + + # Limiter à 8 contraintes + constraints.sort(key=lambda c: role_counts.get(c.get("role", ""), 0), reverse=True) + return constraints[:8] def _build_edges( self, @@ -633,9 +807,14 @@ class GraphBuilder: # Récupérer les embeddings des prototypes de nodes node_prototypes = {} for node in nodes: - if hasattr(node, 'template') and node.template: - if hasattr(node.template, 'embedding_prototype'): - node_prototypes[node.node_id] = np.array(node.template.embedding_prototype) + # Priorité : vecteur en mémoire (metadata), sinon chargement depuis disque + proto_list = node.metadata.get("_prototype_vector") + if proto_list is not None: + node_prototypes[node.node_id] = np.array(proto_list, dtype=np.float32) + elif node.template and node.template.embedding and node.template.embedding.vector_id: + proto_path = Path(node.template.embedding.vector_id) + if proto_path.exists(): + node_prototypes[node.node_id] = np.load(proto_path) if not node_prototypes: logger.warning("No node prototypes available for mapping") @@ -741,7 +920,7 @@ class GraphBuilder: action = Action( type=action_type, target=TargetSpec( - role=target_role, + by_role=target_role, selection_policy="first", fallback_strategy="visual_similarity" ), diff --git 
a/core/graph/node_matcher.py b/core/graph/node_matcher.py index 4fd4f1740..baeff5ed5 100644 --- a/core/graph/node_matcher.py +++ b/core/graph/node_matcher.py @@ -133,10 +133,10 @@ class NodeMatcher: node: WorkflowNode ) -> bool: """Valider les contraintes du node contre l'état.""" - template = node.screen_template - - if template.window_title_pattern: - if not state.raw_level or not state.raw_level.window_title: + template = node.template + + if template and template.window and template.window.title_pattern: + if not state.window or not state.window.window_title: return False return True @@ -179,13 +179,14 @@ class NodeMatcher: # Calculer similarités avec tous les nodes similarities = [] for node in candidate_nodes: - if node.screen_template.embedding_prototype_path: + proto_path = node.template.embedding.vector_id if (node.template and node.template.embedding) else None + if proto_path: try: - prototype = np.load(node.screen_template.embedding_prototype_path) + prototype = np.load(proto_path) similarity = float(np.dot(state_vector, prototype)) similarities.append({ 'node_id': node.node_id, - 'node_label': node.label, + 'node_label': node.name, 'similarity': similarity, 'threshold': self.similarity_threshold, 'matched': similarity >= self.similarity_threshold @@ -204,9 +205,9 @@ class NodeMatcher: 'timestamp': timestamp, 'failed_match_id': failed_match_id, 'state': { - 'window_title': state.raw_level.window_title if state.raw_level else None, - 'screenshot_path': str(state.raw_level.screenshot_path) if state.raw_level else None, - 'ui_elements_count': len(state.perception_level.ui_elements) if state.perception_level else 0 + 'window_title': state.window.window_title if getattr(state, 'window', None) else None, + 'screenshot_path': str(state.raw.screenshot_path) if getattr(state, 'raw', None) else None, + 'ui_elements_count': len(state.ui_elements) if getattr(state, 'ui_elements', None) else 0 }, 'matching_results': { 'best_confidence': best_confidence, diff --git a/core/matching/hierarchical_matcher.py b/core/matching/hierarchical_matcher.py index b2a2326a0..70bf4186c 100644 --- a/core/matching/hierarchical_matcher.py +++ b/core/matching/hierarchical_matcher.py @@ -303,7 +303,7 @@ class HierarchicalMatcher: if not window_info: return 0.5 # Score neutre si pas d'info - template = getattr(node, 'screen_template', None) + template = getattr(node, 'template', None) if not template: return 0.5 @@ -311,7 +311,7 @@ class HierarchicalMatcher: # Matching du titre current_title = window_info.get('title', '') - template_pattern = getattr(template, 'window_title_pattern', None) + template_pattern = getattr(template.window, 'title_pattern', None) if getattr(template, 'window', None) else None if template_pattern and current_title: if self.config.use_regex_title_matching: @@ -329,7 +329,7 @@ class HierarchicalMatcher: # Matching du processus current_process = window_info.get('process_name', '') - template_process = getattr(template, 'process_name', None) + template_process = getattr(template.window, 'process_name', None) if getattr(template, 'window', None) else None if template_process and current_process: if current_process.lower() == template_process.lower(): @@ -367,12 +367,12 @@ class HierarchicalMatcher: Returns: Score de confiance 0.0-1.0 """ - template = getattr(node, 'screen_template', None) + template = getattr(node, 'template', None) if not template: return 0.5 # Récupérer embedding prototype du template - prototype = getattr(template, 'embedding_prototype', None) + prototype = 
getattr(template.embedding, 'vector_id', None) if getattr(template, 'embedding', None) else None
+        # vector_id est désormais un chemin disque : recharger le vecteur pour
+        # conserver un ndarray en aval. Priorité au vecteur gardé en mémoire
+        # dans node.metadata (même convention que workflow_pipeline).
+        meta = getattr(node, 'metadata', None) or {}
+        proto_list = meta.get('_prototype_vector')
+        if proto_list is not None:
+            prototype = np.array(proto_list, dtype=np.float32)
+        elif prototype:
+            try:
+                prototype = np.load(prototype)
+            except Exception:
+                prototype = None
+        else:
+            prototype = None
 
         if prototype is None:
             return 0.5
@@ -445,7 +445,7 @@ class HierarchicalMatcher:
         if not detected_elements:
             return 0.5
 
-        template = getattr(node, 'screen_template', None)
+        template = getattr(node, 'template', None)
         if not template:
             return 0.5
diff --git a/core/models/__init__.py b/core/models/__init__.py
index 966a613ed..3b0c47a57 100644
--- a/core/models/__init__.py
+++ b/core/models/__init__.py
@@ -92,6 +92,41 @@ def get_execution_result():
     from .execution_result import WorkflowExecutionResult
     return WorkflowExecutionResult
 
+# Lazy import via __getattr__ pour éviter les imports circulaires
+_LAZY_IMPORTS = {
+    "StateEmbedding": "core.models.state_embedding",
+    "EmbeddingComponent": "core.models.state_embedding",
+    "Workflow": "core.models.workflow_graph",
+    "WorkflowNode": "core.models.workflow_graph",
+    "WorkflowEdge": "core.models.workflow_graph",
+    "ScreenTemplate": "core.models.workflow_graph",
+    "Action": "core.models.workflow_graph",
+    "TargetSpec": "core.models.workflow_graph",
+    "ActionType": "core.models.workflow_graph",
+    "EdgeConstraints": "core.models.workflow_graph",
+    "PostConditions": "core.models.workflow_graph",
+    "LearningState": "core.models.workflow_graph",
+    "SelectionPolicy": "core.models.workflow_graph",
+    "WindowConstraint": "core.models.workflow_graph",
+    "TextConstraint": "core.models.workflow_graph",
+    "UIConstraint": "core.models.workflow_graph",
+    "EmbeddingPrototype": "core.models.workflow_graph",
+    "EdgeStats": "core.models.workflow_graph",
+    "SafetyRules": "core.models.workflow_graph",
+    "WorkflowStats": "core.models.workflow_graph",
+    "LearningConfig": "core.models.workflow_graph",
+    "WorkflowExecutionResult": "core.models.execution_result",
+    "PerformanceMetrics": "core.models.execution_result",
+}
+
+def __getattr__(name):
+    if name in _LAZY_IMPORTS:
+        import importlib
+        module = importlib.import_module(_LAZY_IMPORTS[name])
+        return getattr(module, name)
+    raise AttributeError(f"module 'core.models' has no attribute {name!r}")
+
+
 __all__ = [
     # Modèles de base standardisés (Tâche 4)
     "BBox",
diff --git a/core/models/base_models.py b/core/models/base_models.py
index e8e92591b..3a3d4e7e6 100644
--- a/core/models/base_models.py
+++ b/core/models/base_models.py
@@ -45,6 +45,25 @@ class BBox(BaseModel):
             return int(v)
         raise ValueError("Dimensions must be numeric")
 
+    def __iter__(self):
+        """Permet le unpacking: x, y, w, h = bbox"""
+        return iter((self.x, self.y, self.width, self.height))
+
+    def __getitem__(self, index):
+        """Permet l'accès par index: bbox[0], bbox[1], etc."""
+        return (self.x, self.y, self.width, self.height)[index]
+
+    def __len__(self):
+        return 4
+
+    def __eq__(self, other):
+        if isinstance(other, BBox):
+            return (self.x == other.x and self.y == other.y and
+                    self.width == other.width and self.height == other.height)
+        if isinstance(other, (tuple, list)) and len(other) == 4:
+            return (self.x, self.y, self.width, self.height) == tuple(other)
+        return NotImplemented
+
     def to_tuple(self) -> Tuple[int, int, int, int]:
         """Conversion vers tuple (x, y, w, h)"""
         return (self.x, self.y, self.width, self.height)
diff --git a/core/models/workflow_graph.py b/core/models/workflow_graph.py
index d64776334..6ad5b85b5 100644
--- a/core/models/workflow_graph.py
+++ b/core/models/workflow_graph.py
@@ -311,8 +311,8 @@ class ScreenTemplate:
 
         # Vérifier contraintes de texte
         if hasattr(screen_state, 'perception'):
-            detected_texts = getattr(screen_state.perception,
'detected_texts', []) - if not self.text.matches(detected_texts): + detected_text = getattr(screen_state.perception, 'detected_text', []) + if not self.text.matches(detected_text): return False, 0.0 # Vérifier contraintes UI diff --git a/core/pipeline/__init__.py b/core/pipeline/__init__.py index a4cdfc632..1f80977eb 100644 --- a/core/pipeline/__init__.py +++ b/core/pipeline/__init__.py @@ -3,5 +3,6 @@ Pipeline module - Orchestration du flux RPA Vision V3 """ from .workflow_pipeline import WorkflowPipeline, create_pipeline +from .screen_analyzer import ScreenAnalyzer -__all__ = ["WorkflowPipeline", "create_pipeline"] +__all__ = ["WorkflowPipeline", "create_pipeline", "ScreenAnalyzer"] diff --git a/core/pipeline/screen_analyzer.py b/core/pipeline/screen_analyzer.py new file mode 100644 index 000000000..acf7d6c96 --- /dev/null +++ b/core/pipeline/screen_analyzer.py @@ -0,0 +1,343 @@ +""" +ScreenAnalyzer - Construction complète d'un ScreenState depuis un screenshot + +Orchestre les 4 niveaux du ScreenState : + Niveau 1 (Raw) : métadonnées de l'image + Niveau 2 (Perception): OCR + embedding global + Niveau 3 (UI) : détection d'éléments UI + Niveau 4 (Contexte) : fenêtre active, workflow en cours + +Ce module comble le chaînon manquant entre la capture brute (Couche 0) +et la construction d'embeddings (Couche 3). +""" + +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Optional, Dict, Any, List + +from PIL import Image + +from core.models.screen_state import ( + ScreenState, + RawLevel, + PerceptionLevel, + ContextLevel, + WindowContext, + EmbeddingRef, +) +from core.models.ui_element import UIElement + +logger = logging.getLogger(__name__) + + +class ScreenAnalyzer: + """ + Construit un ScreenState complet (4 niveaux) depuis un screenshot. + + Utilise le UIDetector pour la détection d'éléments et un OCR + (docTR ou Tesseract) pour l'extraction de texte. + + Example: + >>> analyzer = ScreenAnalyzer() + >>> state = analyzer.analyze("/path/to/screenshot.png") + >>> print(state.perception.detected_text) + >>> print(len(state.ui_elements)) + """ + + def __init__( + self, + ui_detector=None, + ocr_engine: Optional[str] = None, + session_id: str = "", + ): + """ + Args: + ui_detector: Instance de UIDetector (créé si None) + ocr_engine: Moteur OCR à utiliser ("doctr", "tesseract", None=auto) + session_id: ID de la session en cours + """ + self._ui_detector = ui_detector + self._ocr_engine_name = ocr_engine + self._ocr = None + self.session_id = session_id + self._state_counter = 0 + + # Initialisation lazy pour éviter les imports lourds au démarrage + self._ui_detector_initialized = ui_detector is not None + self._ocr_initialized = False + + # ========================================================================= + # API publique + # ========================================================================= + + def analyze( + self, + screenshot_path: str, + window_info: Optional[Dict[str, Any]] = None, + context: Optional[Dict[str, Any]] = None, + ) -> ScreenState: + """ + Analyser un screenshot et construire un ScreenState complet. 
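+
+        Exemple (esquisse ; chemin et infos fenêtre hypothétiques) :
+            >>> state = analyzer.analyze(
+            ...     "data/screens/shot.png",
+            ...     window_info={"title": "DPI - Dossier", "app_name": "chrome"},
+            ... )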
+ + Args: + screenshot_path: Chemin vers le fichier image + window_info: Infos fenêtre active {"title": ..., "app_name": ...} + context: Contexte métier optionnel + + Returns: + ScreenState avec les 4 niveaux remplis + """ + screenshot_path = str(screenshot_path) + self._state_counter += 1 + + state_id = f"{self.session_id}_state_{self._state_counter:04d}" if self.session_id else f"state_{self._state_counter:04d}" + + # Niveau 1 : Raw + raw = self._build_raw_level(screenshot_path) + + # Niveau 2 : Perception (OCR) + detected_text = self._extract_text(screenshot_path) + perception = PerceptionLevel( + embedding=EmbeddingRef( + provider="openclip_ViT-B-32", + vector_id=f"data/embeddings/screens/{state_id}.npy", + dimensions=512, + ), + detected_text=detected_text, + text_detection_method=self._get_ocr_method_name(), + confidence_avg=0.85 if detected_text else 0.0, + ) + + # Niveau 3 : UI Elements + ui_elements = self._detect_ui_elements(screenshot_path, window_info) + + # Niveau 4 : Contexte + window_ctx = self._build_window_context(window_info) + context_level = self._build_context_level(context) + + state = ScreenState( + screen_state_id=state_id, + timestamp=datetime.now(), + session_id=self.session_id, + window=window_ctx, + raw=raw, + perception=perception, + context=context_level, + metadata={ + "analyzer_version": "1.0", + "ui_elements_count": len(ui_elements), + "text_regions_count": len(detected_text), + }, + ui_elements=ui_elements, + ) + + logger.info( + f"ScreenState {state_id} construit: " + f"{len(ui_elements)} éléments UI, {len(detected_text)} textes détectés" + ) + return state + + def analyze_image( + self, + image: Image.Image, + save_dir: str = "data/screens", + window_info: Optional[Dict[str, Any]] = None, + context: Optional[Dict[str, Any]] = None, + ) -> ScreenState: + """ + Analyser une PIL Image (utile quand on a déjà l'image en mémoire). + + Sauvegarde l'image sur disque puis appelle analyze(). 
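+
+        Exemple (esquisse ; `pil_img` est une PIL Image déjà chargée) :
+            >>> state = analyzer.analyze_image(pil_img, save_dir="data/screens")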
+        """
+        save_path = Path(save_dir)
+        save_path.mkdir(parents=True, exist_ok=True)
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        filename = f"screen_{timestamp}.png"
+        filepath = save_path / filename
+
+        image.save(str(filepath))
+        return self.analyze(str(filepath), window_info=window_info, context=context)
+
+    # =========================================================================
+    # Niveau 1 : Raw
+    # =========================================================================
+
+    def _build_raw_level(self, screenshot_path: str) -> RawLevel:
+        file_size = 0
+        try:
+            file_size = os.path.getsize(screenshot_path)
+        except OSError:
+            pass
+
+        return RawLevel(
+            screenshot_path=screenshot_path,
+            capture_method="mss",
+            file_size_bytes=file_size,
+        )
+
+    # =========================================================================
+    # Niveau 2 : Perception — OCR
+    # =========================================================================
+
+    def _extract_text(self, screenshot_path: str) -> List[str]:
+        """Extraire le texte d'un screenshot via OCR."""
+        self._ensure_ocr()
+
+        if self._ocr is None:
+            return []
+
+        try:
+            return self._ocr(screenshot_path)
+        except Exception as e:
+            logger.warning(f"OCR échoué: {e}")
+            return []
+
+    def _ensure_ocr(self) -> None:
+        """Initialiser le moteur OCR (lazy)."""
+        if self._ocr_initialized:
+            return
+        self._ocr_initialized = True
+
+        engine = self._ocr_engine_name
+
+        # Auto-détection : essayer docTR puis Tesseract
+        if engine is None or engine == "doctr":
+            try:
+                self._ocr = self._create_doctr_ocr()
+                self._ocr_engine_name = "doctr"  # mémoriser le moteur réellement actif
+                logger.info("OCR initialisé avec docTR")
+                return
+            except Exception as e:
+                if engine == "doctr":
+                    logger.warning(f"docTR non disponible: {e}")
+                    return
+                logger.debug(f"docTR indisponible, essai de Tesseract: {e}")
+
+        if engine is None or engine == "tesseract":
+            try:
+                self._ocr = self._create_tesseract_ocr()
+                self._ocr_engine_name = "tesseract"  # mémoriser le moteur réellement actif
+                logger.info("OCR initialisé avec Tesseract")
+                return
+            except Exception as e:
+                logger.warning(f"Tesseract non disponible: {e}")
+
+        logger.warning("Aucun moteur OCR disponible — detected_text sera vide")
+
+    def _create_doctr_ocr(self):
+        """Créer une fonction OCR basée sur docTR."""
+        from doctr.io import DocumentFile
+        from doctr.models import ocr_predictor
+
+        predictor = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
+
+        def ocr_func(image_path: str) -> List[str]:
+            doc = DocumentFile.from_images(image_path)
+            result = predictor(doc)
+            texts = []
+            for page in result.pages:
+                for block in page.blocks:
+                    for line in block.lines:
+                        line_text = " ".join(word.value for word in line.words)
+                        if line_text.strip():
+                            texts.append(line_text.strip())
+            return texts
+
+        return ocr_func
+
+    def _create_tesseract_ocr(self):
+        """Créer une fonction OCR basée sur Tesseract."""
+        import pytesseract
+
+        def ocr_func(image_path: str) -> List[str]:
+            img = Image.open(image_path)
+            raw_text = pytesseract.image_to_string(img, lang="fra+eng")
+            lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
+            return lines
+
+        return ocr_func
+
+    def _get_ocr_method_name(self) -> str:
+        if self._ocr is None:
+            return "none"
+        # _ocr_engine_name est renseigné par _ensure_ocr avec le moteur
+        # réellement initialisé (docTR ou Tesseract), y compris en auto-détection
+        return self._ocr_engine_name or "none"
+
+    # =========================================================================
+    # Niveau 3 : UI Elements
+    # =========================================================================
+
+    def _detect_ui_elements(
+        self,
+        screenshot_path: str,
+        window_info: Optional[Dict[str, Any]] = None,
+    ) -> List[UIElement]:
+        """Détecter les éléments UI dans le screenshot."""
+
self._ensure_ui_detector() + + if self._ui_detector is None: + return [] + + try: + elements = self._ui_detector.detect( + screenshot_path, window_context=window_info + ) + return elements + except Exception as e: + logger.warning(f"Détection UI échouée: {e}") + return [] + + def _ensure_ui_detector(self) -> None: + """Initialiser le UIDetector (lazy).""" + if self._ui_detector_initialized: + return + self._ui_detector_initialized = True + + try: + from core.detection.ui_detector import UIDetector, DetectionConfig + + config = DetectionConfig( + use_owl_detection=False, # Désactiver OWL par défaut (lourd) + use_vlm_classification=True, + confidence_threshold=0.6, + ) + self._ui_detector = UIDetector(config) + logger.info("UIDetector initialisé") + except Exception as e: + logger.warning(f"UIDetector non disponible: {e}") + self._ui_detector = None + + # ========================================================================= + # Niveau 4 : Contexte + # ========================================================================= + + def _build_window_context( + self, window_info: Optional[Dict[str, Any]] = None + ) -> WindowContext: + if window_info: + return WindowContext( + app_name=window_info.get("app_name", "unknown"), + window_title=window_info.get("title", "Unknown"), + screen_resolution=window_info.get("screen_resolution", [1920, 1080]), + workspace=window_info.get("workspace", "main"), + ) + return WindowContext( + app_name="unknown", + window_title="Unknown", + screen_resolution=[1920, 1080], + workspace="main", + ) + + def _build_context_level( + self, context: Optional[Dict[str, Any]] = None + ) -> ContextLevel: + if context: + return ContextLevel( + current_workflow_candidate=context.get("workflow_candidate"), + workflow_step=context.get("workflow_step"), + user_id=context.get("user_id", ""), + tags=context.get("tags", []), + business_variables=context.get("business_variables", {}), + ) + return ContextLevel() diff --git a/core/pipeline/workflow_pipeline.py b/core/pipeline/workflow_pipeline.py index a9030909c..ff49061b6 100644 --- a/core/pipeline/workflow_pipeline.py +++ b/core/pipeline/workflow_pipeline.py @@ -319,17 +319,25 @@ class WorkflowPipeline: np.ndarray ou None si aucun vecteur trouvé """ - # v1: prototype stocké en liste directement + # v3: prototype stocké dans metadata (Phase 0, mars 2026) + meta = getattr(node, "metadata", {}) or {} + proto_list = meta.get("_prototype_vector") + if proto_list is not None and isinstance(proto_list, list): + try: + return np.array(proto_list, dtype=np.float32) + except Exception as e: + logger.debug(f"Failed to convert metadata prototype: {e}") + + # v1: prototype stocké en liste directement sur template tpl = getattr(node, "template", None) if tpl is not None: proto_list = getattr(tpl, "embedding_prototype", None) if isinstance(proto_list, list): try: - v = np.array(proto_list, dtype=np.float32) - return v + return np.array(proto_list, dtype=np.float32) except Exception as e: logger.debug(f"Failed to convert embedding_prototype list: {e}") - + # v2: prototype stocké sur disque via EmbeddingPrototype.vector_id if tpl is not None: emb = getattr(tpl, "embedding", None) @@ -341,16 +349,6 @@ class WorkflowPipeline: except Exception as e: logger.debug(f"Failed to load vector from {vector_id}: {e}") - # fallback (ancienne nomenclature) - st = getattr(node, "screen_template", None) - if st is not None: - p = getattr(st, "embedding_prototype_path", None) - if p: - try: - return np.load(p).astype(np.float32) - except Exception as e: - 
logger.debug(f"Failed to load legacy vector from {p}: {e}") - return None # ========================================================================= @@ -918,18 +916,6 @@ class WorkflowPipeline: "recovery_attempted": recovery_result.success, "recovery_message": recovery_result.message if recovery_result else None } - self.error_handler.error_history.append(error_ctx) - self.error_handler._log_error(error_ctx) - - return { - "execution_id": execution_id, - "workflow_id": workflow_id, - "success": False, - "step_type": "execution_error", - "error": str(e), - "execution_time_ms": total_time_ms, - "correlation_id": execution_id - } # ============================================================================= diff --git a/core/training/quality_validator.py b/core/training/quality_validator.py index c03b46e8e..80f341ad2 100644 --- a/core/training/quality_validator.py +++ b/core/training/quality_validator.py @@ -210,7 +210,7 @@ class TrainingQualityValidator: # 3. Vérifier observations par node nodes = getattr(workflow, 'nodes', []) for node in nodes: - obs_count = getattr(node, 'observation_count', 0) + obs_count = (node.metadata.get('observation_count', 0) if getattr(node, 'metadata', None) else 0) if obs_count < self.config.min_observations_per_node: recommendations.append( f"Node '{getattr(node, 'node_id', 'unknown')}' a seulement {obs_count} observations " @@ -240,7 +240,7 @@ class TrainingQualityValidator: len(outlier_indices) <= len(embeddings) * self.config.max_outlier_ratio and (validation_result is None or validation_result.is_valid) and all( - getattr(node, 'observation_count', 0) >= self.config.min_observations_per_node + (node.metadata.get('observation_count', 0) if getattr(node, 'metadata', None) else 0) >= self.config.min_observations_per_node for node in nodes ) ) diff --git a/examples/capture_and_test.py b/examples/capture_and_test.py index eab4773fa..324e44d01 100755 --- a/examples/capture_and_test.py +++ b/examples/capture_and_test.py @@ -167,7 +167,8 @@ def test_workflow_construction(session, session_file): if workflow.nodes: logger.info(f"\n📊 {len(workflow.nodes)} patterns détectés:") for node in workflow.nodes: - logger.info(f" • {node.node_id}: {node.observation_count} observations") + obs = node.metadata.get("observation_count", "?") if node.metadata else "?" + logger.info(f" • {node.node_id}: {obs} observations") else: logger.warning("\n⚠️ Aucun pattern détecté") logger.info(" Conseils:") @@ -178,7 +179,8 @@ def test_workflow_construction(session, session_file): if workflow.edges: logger.info(f"\n🔗 {len(workflow.edges)} transitions détectées:") for edge in workflow.edges: - logger.info(f" • {edge.from_node_id} → {edge.to_node_id} ({edge.observation_count}x)") + count = edge.stats.execution_count if edge.stats else 0 + logger.info(f" • {edge.from_node} → {edge.to_node} ({count}x)") logger.info(f"\n💾 Index FAISS: {faiss_manager.index.ntotal} vecteurs") logger.info(f"📁 Session: {session_file}") diff --git a/examples/test_workflow_construction.py b/examples/test_workflow_construction.py index 24ce13a8a..01e05bf43 100644 --- a/examples/test_workflow_construction.py +++ b/examples/test_workflow_construction.py @@ -88,17 +88,19 @@ def test_workflow_construction(session_path: str): for node in workflow.nodes: logger.info(f" Node {node.node_id}:") logger.info(f" - Name: {node.name}") - logger.info(f" - Observations: {node.observation_count}") + obs = node.metadata.get("observation_count", "?") if node.metadata else "?" 
+ logger.info(f" - Observations: {obs}") logger.info(f" - Similarity threshold: {node.template.embedding.min_cosine_similarity}") # Étape 5: Analyser les edges logger.info("\n[5/5] Analyse des edges") for edge in workflow.edges: logger.info(f" Edge {edge.edge_id}:") - logger.info(f" - From: {edge.from_node_id} → To: {edge.to_node_id}") - logger.info(f" - Action: {edge.action.type}") - logger.info(f" - Target: {edge.action.target.role}") - logger.info(f" - Observations: {edge.observation_count}") + logger.info(f" - From: {edge.from_node} → To: {edge.to_node}") + logger.info(f" - Action: {edge.action.type if edge.action else '?'}") + logger.info(f" - Target: {edge.action.target.by_role if edge.action and edge.action.target else '?'}") + count = edge.stats.execution_count if edge.stats else 0 + logger.info(f" - Observations: {count}") # Résumé logger.info("\n" + "=" * 70) diff --git a/monitoring_server.py b/monitoring_server.py index b2314c43a..b8f31776a 100644 --- a/monitoring_server.py +++ b/monitoring_server.py @@ -1,6 +1,7 @@ +"""RPA Vision V3 - Serveur de Monitoring (port 5003).""" + from flask import Flask, render_template_string import psutil -import json from datetime import datetime app = Flask(__name__) @@ -11,7 +12,7 @@ def monitoring(): -✅ Fiche #1 & #2 Corrections Applied
-🎯 BBOX Precision: ~95% (improved from ~60%)
-🔧 All contrats de données unified
-✅ Fiche #1 & #2 Corrections Applied
-🎯 BBOX Precision: ~95% (improved from ~60%)
-🔧 All contrats de données unified
-🚀 Full ecosystem running!
-Aucun workflow en attente de validation
++ Les workflows importes depuis le streaming apparaitront ici. +
+Aucune etape
+ ) : (
+Visualisation des donnees extraites par le moteur RPA depuis les ecrans des applications (scraping visuel).
+Le module core.extraction n'est pas encore installe. Cette fonctionnalite sera disponible dans une prochaine version.
+{{ ext.description or '' }} | {{ ext.date or '' }}
+Les extractions apparaitront ici lorsque le moteur RPA aura extrait des donnees depuis les ecrans des applications.
+Liste de tous les gestes (raccourcis clavier, actions systeme) connus par le moteur RPA. Ces gestes sont utilisables dans les workflows pour interagir avec les applications.
+Le module agent_chat.gesture_catalog n'est pas installe ou accessible. Les gestes ci-dessous sont le catalogue integre par defaut.
+Suivi des sessions de capture, replays en cours et statistiques du serveur de streaming (port 5005).
+