feat: unified chat, GestureCatalog, Copilot, Léa UI, data extraction, replay verification

Major overhaul of the Agent Chat system plus several new modules:

- Unified chat: the dual Workflows/Agent Libre split is removed; everything goes through /api/chat
  with three-level resolution (workflow → gesture → "montre-moi")
- GestureCatalog: 38 universal Windows keyboard shortcuts with semantic matching,
  automatic substitution in replays, and an /api/gestures endpoint
- Copilot mode: step-by-step workflow execution with human validation over WebSocket
  (approve/skip/abort) before each action
- Léa UI (agent_v0/lea_ui/): PyQt5 interface for Windows with a transparent overlay
  for visual feedback during replay
- Data Extraction (core/extraction/): visual data-extraction engine
  (OCR + VLM → SQLite), with YAML schemas and CSV/Excel export (see the example schema after this list)
- ReplayVerifier (agent_v0/server_v1/): post-action verification by screenshot comparison,
  with retry logic (max 3 attempts)
- Hardened IntentParser: better regex fallback, GREETING type, improved patterns
- Dashboard: new gestures, streaming, and extractions pages
- Tests: 63 GestureCatalog tests, 47 extraction tests, fixes to existing tests
- Deprecation: /api/agent/plan and /api/agent/execute now return HTTP 410,
  removal of the hardcoded _plan_to_replay_actions code
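
A minimal extraction schema in the form core/extraction consumes — a sketch only; the field
names and values below are illustrative and not part of this commit:

    name: dossier_patient
    description: Patient record fields read from the DPI screen
    fields:
      - name: nom
        description: Patient family name as displayed
        type: text
        required: true
      - name: date_naissance
        description: Birth date as displayed (DD/MM/YYYY)
        type: date
        validation: '\d{2}/\d{2}/\d{4}'
    navigation:
      type: pagination
      next_record: click_next_in_list
      max_records: 100
      delay_ms: 1000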

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dom
2026-03-15 10:02:09 +01:00
parent 74a1cb4e03
commit cf495dd82f
93 changed files with 12463 additions and 1080 deletions


@@ -1,4 +1,11 @@
"""Screen capture module"""
from .screen_capturer import ScreenCapturer
__all__ = ['ScreenCapturer']
try:
from .event_listener import EventListener
except ImportError:
EventListener = None
from .session_recorder import SessionRecorder
__all__ = ['ScreenCapturer', 'EventListener', 'SessionRecorder']


@@ -0,0 +1,258 @@
"""
EventListener - Capture d'événements clavier/souris pour RPA Vision V3
Couche 0 (RawSession) : capture en temps réel des interactions utilisateur
(clics souris, frappes clavier) avec horodatage précis et contexte de fenêtre.
Génère des objets Event compatibles avec RawSession.
"""
import logging
import threading
import time
from typing import Optional, Callable, List, Dict, Any
from datetime import datetime
logger = logging.getLogger(__name__)
try:
from pynput import mouse, keyboard
PYNPUT_AVAILABLE = True
except ImportError:
mouse = None # type: ignore
keyboard = None # type: ignore
PYNPUT_AVAILABLE = False
logger.warning("pynput non disponible — EventListener désactivé")
class EventListener:
"""
Listener d'événements clavier/souris basé sur pynput.
Capture les interactions utilisateur en temps réel et les transmet
via un callback. Compatible avec le format Event de RawSession.
Example:
>>> listener = EventListener()
>>> listener.start(callback=on_event)
>>> # ... l'utilisateur interagit ...
>>> events = listener.stop()
"""
def __init__(self, capture_mouse_move: bool = False):
"""
Args:
capture_mouse_move: Capturer les déplacements souris (volumineux, désactivé par défaut)
"""
if not PYNPUT_AVAILABLE:
raise ImportError(
"pynput est requis pour EventListener. "
"Installer avec: pip install pynput"
)
self.capture_mouse_move = capture_mouse_move
self._running = False
self._start_time: Optional[float] = None
self._events: List[Dict[str, Any]] = []
self._callback: Optional[Callable[[Dict[str, Any]], None]] = None
self._lock = threading.Lock()
self._mouse_listener = None
self._keyboard_listener = None
def start(self, callback: Optional[Callable[[Dict[str, Any]], None]] = None) -> None:
"""
Démarrer la capture d'événements.
Args:
callback: Fonction appelée pour chaque événement capturé.
Reçoit un dict au format Event.to_dict().
"""
if self._running:
logger.warning("EventListener déjà en cours")
return
self._callback = callback
self._events = []
self._start_time = time.time()
self._running = True
# Démarrer les listeners
self._mouse_listener = mouse.Listener(
on_click=self._on_click,
on_scroll=self._on_scroll,
on_move=self._on_move if self.capture_mouse_move else None,
)
self._keyboard_listener = keyboard.Listener(
on_press=self._on_key_press,
on_release=self._on_key_release,
)
self._mouse_listener.start()
self._keyboard_listener.start()
logger.info("EventListener démarré")
def stop(self) -> List[Dict[str, Any]]:
"""
Arrêter la capture et retourner les événements capturés.
Returns:
Liste de dicts au format Event
"""
self._running = False
if self._mouse_listener:
self._mouse_listener.stop()
self._mouse_listener = None
if self._keyboard_listener:
self._keyboard_listener.stop()
self._keyboard_listener = None
logger.info(f"EventListener arrêté — {len(self._events)} événements capturés")
with self._lock:
return list(self._events)
@property
def is_running(self) -> bool:
return self._running
@property
def event_count(self) -> int:
with self._lock:
return len(self._events)
def _relative_time(self) -> float:
"""Temps relatif depuis le début de la capture."""
if self._start_time is None:
return 0.0
return round(time.time() - self._start_time, 3)
def _get_window_context(self) -> Dict[str, str]:
"""Obtenir le contexte de la fenêtre active."""
try:
import subprocess
# Utiliser xdotool sur Linux pour obtenir la fenêtre active
result = subprocess.run(
["xdotool", "getactivewindow", "getwindowname"],
capture_output=True, text=True, timeout=1
)
title = result.stdout.strip() if result.returncode == 0 else "Unknown"
result2 = subprocess.run(
["xdotool", "getactivewindow", "getwindowpid"],
capture_output=True, text=True, timeout=1
)
pid = result2.stdout.strip() if result2.returncode == 0 else ""
# Essayer d'obtenir le nom du process
app_name = "unknown"
if pid:
try:
result3 = subprocess.run(
["ps", "-p", pid, "-o", "comm="],
capture_output=True, text=True, timeout=1
)
app_name = result3.stdout.strip() if result3.returncode == 0 else "unknown"
except Exception:
pass
return {"title": title, "app_name": app_name}
except Exception:
return {"title": "Unknown", "app_name": "unknown"}
def _emit_event(self, event: Dict[str, Any]) -> None:
"""Enregistrer et émettre un événement."""
with self._lock:
self._events.append(event)
if self._callback:
try:
self._callback(event)
except Exception as e:
logger.error(f"Erreur callback événement: {e}")
# === Handlers souris ===
def _on_click(self, x: int, y: int, button, pressed: bool) -> None:
if not self._running or not pressed:
return
event = {
"t": self._relative_time(),
"type": "mouse_click",
"button": button.name,
"pos": [x, y],
"window": self._get_window_context(),
"screenshot_id": None,
}
self._emit_event(event)
def _on_scroll(self, x: int, y: int, dx: int, dy: int) -> None:
if not self._running:
return
event = {
"t": self._relative_time(),
"type": "mouse_scroll",
"delta": dy * 120,
"pos": [x, y],
"window": self._get_window_context(),
"screenshot_id": None,
}
self._emit_event(event)
def _on_move(self, x: int, y: int) -> None:
if not self._running:
return
event = {
"t": self._relative_time(),
"type": "mouse_move",
"pos": [x, y],
"window": self._get_window_context(),
"screenshot_id": None,
}
self._emit_event(event)
# === Handlers clavier ===
def _on_key_press(self, key) -> None:
if not self._running:
return
key_name = self._key_to_string(key)
event = {
"t": self._relative_time(),
"type": "key_press",
"keys": [key_name],
"window": self._get_window_context(),
"screenshot_id": None,
}
self._emit_event(event)
def _on_key_release(self, key) -> None:
if not self._running:
return
key_name = self._key_to_string(key)
event = {
"t": self._relative_time(),
"type": "key_release",
"keys": [key_name],
"window": self._get_window_context(),
"screenshot_id": None,
}
self._emit_event(event)
@staticmethod
def _key_to_string(key) -> str:
"""Convertir une touche pynput en string lisible."""
if hasattr(key, 'char') and key.char:
return key.char
if hasattr(key, 'name'):
return key.name.upper()
return str(key)
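
An event captured by this listener is a plain dict; a mouse click, for example, comes out
roughly as follows (illustrative values, mirroring the _on_click handler above):

    {
        "t": 3.214,
        "type": "mouse_click",
        "button": "left",
        "pos": [842, 310],
        "window": {"title": "Facturation - Firefox", "app_name": "firefox"},
        "screenshot_id": None,
    }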


@@ -0,0 +1,344 @@
"""
SessionRecorder - Enregistrement de sessions RPA complètes
Orchestre EventListener + ScreenCapturer pour produire un RawSession :
- Capture les événements clavier/souris en continu
- Prend un screenshot à chaque clic (ou périodiquement)
- Sauvegarde les screenshots sur disque
- Produit un RawSession complet avec events + screenshots liés
Usage:
>>> recorder = SessionRecorder(output_dir="data/sessions")
>>> recorder.start(workflow_name="login_workflow")
>>> # ... l'utilisateur effectue ses actions ...
>>> session = recorder.stop()
>>> print(f"{len(session.events)} events, {len(session.screenshots)} screenshots")
"""
import logging
import os
import platform
import threading
import time
from datetime import datetime
from pathlib import Path
from typing import Optional, Callable, Dict, Any, List
from core.models.raw_session import RawSession, Event, Screenshot, RawWindowContext
logger = logging.getLogger(__name__)
class SessionRecorder:
"""
Enregistreur de sessions RPA complet.
Combine EventListener (clavier/souris) et ScreenCapturer (screenshots)
pour produire une RawSession exploitable par le GraphBuilder.
"""
def __init__(
self,
output_dir: str = "data/training/sessions",
screenshot_on_click: bool = True,
screenshot_interval_ms: int = 0,
capture_keyboard: bool = True,
):
"""
Args:
output_dir: Répertoire de sortie pour les sessions
screenshot_on_click: Prendre un screenshot à chaque clic
screenshot_interval_ms: Intervalle de capture périodique (0 = désactivé)
capture_keyboard: Capturer les frappes clavier
"""
self.output_dir = Path(output_dir)
self.screenshot_on_click = screenshot_on_click
self.screenshot_interval_ms = screenshot_interval_ms
self.capture_keyboard = capture_keyboard
self._session: Optional[RawSession] = None
self._session_dir: Optional[Path] = None
self._screenshots_dir: Optional[Path] = None
self._running = False
self._screenshot_counter = 0
self._lock = threading.Lock()
# Composants (lazy init)
self._event_listener = None
self._screen_capturer = None
self._periodic_thread: Optional[threading.Thread] = None
# Callbacks optionnels
self._on_event: Optional[Callable[[Dict[str, Any]], None]] = None
self._on_screenshot: Optional[Callable[[str], None]] = None
def start(
self,
workflow_name: str = "",
session_id: Optional[str] = None,
on_event: Optional[Callable[[Dict[str, Any]], None]] = None,
on_screenshot: Optional[Callable[[str], None]] = None,
) -> str:
"""
Démarrer l'enregistrement d'une session.
Args:
workflow_name: Nom du workflow pour le contexte
session_id: ID de session (généré si None)
on_event: Callback appelé pour chaque événement
on_screenshot: Callback appelé pour chaque screenshot
Returns:
session_id de la session démarrée
"""
if self._running:
logger.warning("SessionRecorder déjà en cours")
return self._session.session_id if self._session else ""
# Générer ID de session
if session_id is None:
session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Créer répertoires
self._session_dir = self.output_dir / session_id
        # Screenshots live directly under the session directory, matching the
        # "screenshots/<file>" relative_path recorded for each Screenshot.
        self._screenshots_dir = self._session_dir / "screenshots"
self._screenshots_dir.mkdir(parents=True, exist_ok=True)
# Initialiser la session
self._session = RawSession(
session_id=session_id,
agent_version="rpa_vision_v3",
environment=self._get_environment(),
user={"id": os.getenv("USER", "unknown")},
context={"workflow": workflow_name, "tags": []},
started_at=datetime.now(),
)
self._screenshot_counter = 0
self._on_event = on_event
self._on_screenshot = on_screenshot
self._running = True
# Démarrer le listener d'événements
self._start_event_listener()
# Démarrer la capture périodique si configurée
if self.screenshot_interval_ms > 0:
self._start_periodic_capture()
logger.info(
f"SessionRecorder démarré: {session_id} "
f"(screenshots_dir={self._screenshots_dir})"
)
return session_id
def stop(self) -> RawSession:
"""
Arrêter l'enregistrement et retourner la session complète.
Returns:
RawSession avec tous les événements et screenshots
"""
if not self._running:
logger.warning("SessionRecorder non démarré")
return self._session
self._running = False
# Arrêter la capture périodique
if self._periodic_thread and self._periodic_thread.is_alive():
self._periodic_thread.join(timeout=2)
# Arrêter le listener d'événements
if self._event_listener:
self._event_listener.stop()
# Finaliser la session
self._session.ended_at = datetime.now()
# Sauvegarder la session JSON
session_path = self._session_dir / f"{self._session.session_id}.json"
self._session.save_to_file(session_path)
logger.info(
f"SessionRecorder arrêté: {self._session.session_id} "
f"({len(self._session.events)} events, "
f"{len(self._session.screenshots)} screenshots) "
f"{session_path}"
)
return self._session
@property
def is_running(self) -> bool:
return self._running
@property
def event_count(self) -> int:
return len(self._session.events) if self._session else 0
@property
def screenshot_count(self) -> int:
return len(self._session.screenshots) if self._session else 0
# =========================================================================
# Capture d'événements
# =========================================================================
def _start_event_listener(self) -> None:
"""Démarrer le listener d'événements."""
try:
from core.capture.event_listener import EventListener
self._event_listener = EventListener(capture_mouse_move=False)
self._event_listener.start(callback=self._on_raw_event)
logger.info("EventListener démarré")
except ImportError:
logger.warning(
"EventListener non disponible (pynput manquant). "
"Seuls les screenshots périodiques seront capturés."
)
def _on_raw_event(self, raw_event: Dict[str, Any]) -> None:
"""Callback appelé par EventListener pour chaque événement."""
if not self._running or not self._session:
return
# Convertir en Event
event = Event(
t=raw_event.get("t", 0.0),
type=raw_event.get("type", "unknown"),
window=RawWindowContext(
title=raw_event.get("window", {}).get("title", "Unknown"),
app_name=raw_event.get("window", {}).get("app_name", "unknown"),
),
screenshot_id=None,
data={
k: v
for k, v in raw_event.items()
if k not in ("t", "type", "window", "screenshot_id")
},
)
# Screenshot sur clic
if self.screenshot_on_click and event.type == "mouse_click":
screenshot_id = self._take_screenshot()
if screenshot_id:
event.screenshot_id = screenshot_id
with self._lock:
self._session.add_event(event)
# Callback utilisateur
if self._on_event:
try:
self._on_event(raw_event)
except Exception as e:
logger.warning(f"Erreur callback on_event: {e}")
# =========================================================================
# Capture de screenshots
# =========================================================================
def _take_screenshot(self) -> Optional[str]:
"""Prendre un screenshot et le sauvegarder."""
if not self._running or not self._session:
return None
try:
self._ensure_screen_capturer()
if self._screen_capturer is None:
return None
frame = self._screen_capturer.capture_frame()
if frame is None:
return None
# Sauvegarder
self._screenshot_counter += 1
screenshot_id = f"ss_{self._screenshot_counter:04d}"
filename = f"screen_{self._screenshot_counter:04d}.png"
filepath = self._screenshots_dir / filename
self._screen_capturer.save_frame(frame, str(filepath))
# Enregistrer dans la session
screenshot = Screenshot(
screenshot_id=screenshot_id,
relative_path=f"screenshots/{filename}",
captured_at=datetime.now().isoformat(),
)
with self._lock:
self._session.add_screenshot(screenshot)
# Callback utilisateur
if self._on_screenshot:
try:
self._on_screenshot(str(filepath))
except Exception as e:
logger.warning(f"Erreur callback on_screenshot: {e}")
return screenshot_id
except Exception as e:
logger.warning(f"Erreur capture screenshot: {e}")
return None
def _ensure_screen_capturer(self) -> None:
"""Initialiser le ScreenCapturer (lazy)."""
if self._screen_capturer is not None:
return
try:
from core.capture.screen_capturer import ScreenCapturer
self._screen_capturer = ScreenCapturer(
buffer_size=5,
detect_changes=False,
)
except Exception as e:
logger.warning(f"ScreenCapturer non disponible: {e}")
def _start_periodic_capture(self) -> None:
"""Démarrer la capture périodique en thread."""
interval_s = self.screenshot_interval_ms / 1000.0
def _periodic_loop():
while self._running:
self._take_screenshot()
time.sleep(interval_s)
self._periodic_thread = threading.Thread(
target=_periodic_loop, daemon=True, name="periodic_capture"
)
self._periodic_thread.start()
logger.info(
f"Capture périodique démarrée (intervalle={self.screenshot_interval_ms}ms)"
)
# =========================================================================
# Helpers
# =========================================================================
def _get_environment(self) -> Dict[str, Any]:
"""Collecter les informations d'environnement."""
env = {
"os": platform.system().lower(),
"os_version": platform.version(),
"hostname": platform.node(),
"screen": {},
}
# Résolution d'écran
try:
self._ensure_screen_capturer()
if self._screen_capturer:
w, h = self._screen_capturer.get_screen_resolution()
env["screen"] = {
"primary_resolution": [w, h],
}
except Exception:
env["screen"] = {"primary_resolution": [1920, 1080]}
return env
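
With the defaults above, one recording ends up on disk roughly like this (a sketch, assuming
screenshots sit directly under the session directory, as the recorded relative_path
"screenshots/<file>" implies):

    data/training/sessions/
        session_20260315_100209/
            session_20260315_100209.json      (RawSession: events + screenshot metadata)
            screenshots/
                screen_0001.png
                screen_0002.png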


@@ -69,9 +69,10 @@ class DetectionConfig:
"""Configuration de la détection UI hybride"""
# VLM
# Modèles recommandés:
# - "qwen2.5vl:7b" (plus rapide, meilleur avec format='json', recommandé)
# - "qwen2.5vl:3b" (léger, tient en GPU 12GB avec split partiel)
# - "qwen2.5vl:7b" (meilleur mais 13GB mémoire, CPU-only sur RTX 5070)
# - "qwen3-vl:8b" (plus gros, supporté mais plus d'erreurs JSON)
vlm_model: str = "qwen2.5vl:7b"
vlm_model: str = "qwen2.5vl:3b"
vlm_endpoint: str = "http://localhost:11434"
use_vlm_classification: bool = True # Utiliser VLM pour classifier


@@ -451,6 +451,9 @@ class FAISSManager:
return results
# Alias pour compatibilité (WorkflowPipeline, NodeMatcher)
search = search_similar
def remove_embedding(self, faiss_id: int) -> bool:
"""
Supprimer un embedding de l'index


@@ -212,8 +212,8 @@ class StateEmbeddingBuilder:
# Concaténer tous les textes détectés
texts = []
if hasattr(screen_state.perception, 'detected_texts'):
texts = screen_state.perception.detected_texts
if hasattr(screen_state.perception, 'detected_text'):
texts = screen_state.perception.detected_text
combined_text = " ".join(texts) if texts else ""


@@ -664,12 +664,12 @@ class WorkflowSimulator:
try:
if check.kind == "text_present":
# Vérifier présence de texte
detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
detected_texts = getattr(screen_state.perception, 'detected_text', []) if hasattr(screen_state, 'perception') else []
return any(check.value in text for text in detected_texts)
elif check.kind == "text_absent":
# Vérifier absence de texte
detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
detected_texts = getattr(screen_state.perception, 'detected_text', []) if hasattr(screen_state, 'perception') else []
return not any(check.value in text for text in detected_texts)
elif check.kind == "element_present":
@@ -681,7 +681,7 @@ class WorkflowSimulator:
elif check.kind == "window_title_contains":
# Vérifier titre de fenêtre
window_title = getattr(screen_state.raw_level, 'window_title', '') if hasattr(screen_state, 'raw_level') else ''
window_title = getattr(screen_state.window, 'window_title', '') if hasattr(screen_state, 'window') else ''
return check.value in window_title
else:


@@ -509,13 +509,13 @@ class ErrorHandler:
'workflow_edge': edge,
'action': action,
'details': {
'target_role': action.target.role if hasattr(action.target, 'role') else None,
'target_text': action.target.text_pattern if hasattr(action.target, 'text_pattern') else None
'target_role': action.target.by_role if hasattr(action.target, 'by_role') else None,
'target_text': action.target.by_text if hasattr(action.target, 'by_text') else None
},
'original_data': {
'target': {
'role': action.target.role if hasattr(action.target, 'role') else None,
'text_pattern': action.target.text_pattern if hasattr(action.target, 'text_pattern') else None,
'by_role': action.target.by_role if hasattr(action.target, 'by_role') else None,
'by_text': action.target.by_text if hasattr(action.target, 'by_text') else None,
'bbox': getattr(action.target, 'bbox', None)
}
}


@@ -0,0 +1,29 @@
"""
Module d'extraction de donnees structurees depuis des captures d'ecran.
Ce module orchestre le cycle complet :
schema YAML -> navigation -> screenshot -> VLM/OCR -> validation -> SQLite -> CSV/Excel
Classes principales :
- ExtractionSchema : definition des champs et regles de navigation
- ExtractionField : definition d'un champ individuel
- FieldExtractor : extraction via VLM (Ollama) ou OCR (docTR)
- DataStore : stockage SQLite + export CSV/Excel
- IterationController : controle de la boucle de navigation
- ExtractionEngine : orchestrateur principal
"""
from .schema import ExtractionField, ExtractionSchema
from .field_extractor import FieldExtractor
from .data_store import DataStore
from .iteration_controller import IterationController
from .extraction_engine import ExtractionEngine
__all__ = [
"ExtractionField",
"ExtractionSchema",
"FieldExtractor",
"DataStore",
"IterationController",
"ExtractionEngine",
]
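
A possible end-to-end use of these classes, assuming a schema file and a running streaming
session already exist (the paths and session id are illustrative):

    from core.extraction import ExtractionEngine, ExtractionSchema

    schema = ExtractionSchema.from_yaml("data/extractions/schemas/dossier_patient.yaml")
    engine = ExtractionEngine(schema)

    # Runs the navigate -> screenshot -> extract -> validate -> store loop
    # until max_records is reached or stop_extraction() is called.
    extraction_id = engine.start_extraction(session_id="stream-01")

    # Export everything collected during this run.
    engine.store.export_csv(extraction_id, f"data/extractions/{extraction_id}.csv")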


@@ -0,0 +1,420 @@
"""
DataStore - Stockage SQLite des donnees extraites + export CSV/Excel
Chaque session d'extraction (ExtractionSchema applique a un ecran) cree
une entree dans la table `extractions`. Les enregistrements individuels
sont stockes dans la table `records` avec leurs donnees JSON, le chemin
du screenshot source et un score de confiance.
"""
import csv
import json
import logging
import sqlite3
import uuid
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import Any, Dict, List, Optional
from .schema import ExtractionSchema
logger = logging.getLogger(__name__)
class DataStore:
"""Stockage des donnees extraites dans SQLite avec export CSV/Excel."""
def __init__(self, db_path: str = "data/extractions/store.db"):
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
self._init_db()
# ------------------------------------------------------------------
# Initialisation
# ------------------------------------------------------------------
def _init_db(self) -> None:
"""Creer les tables si necessaire."""
with self._connect() as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS extractions (
id TEXT PRIMARY KEY,
schema_name TEXT NOT NULL,
schema_json TEXT NOT NULL,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'in_progress',
record_count INTEGER NOT NULL DEFAULT 0
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS records (
id TEXT PRIMARY KEY,
extraction_id TEXT NOT NULL,
data_json TEXT NOT NULL,
screenshot_path TEXT,
confidence REAL NOT NULL DEFAULT 0.0,
errors_json TEXT,
created_at TEXT NOT NULL,
FOREIGN KEY (extraction_id) REFERENCES extractions(id)
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_records_extraction
ON records(extraction_id)
""")
def _connect(self) -> sqlite3.Connection:
"""Ouvrir une connexion SQLite."""
conn = sqlite3.connect(str(self.db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
return conn
# ------------------------------------------------------------------
# Extractions (sessions)
# ------------------------------------------------------------------
def create_extraction(self, schema: ExtractionSchema) -> str:
"""
Creer une nouvelle session d'extraction.
Args:
schema: Schema d'extraction
Returns:
extraction_id (UUID)
"""
extraction_id = str(uuid.uuid4())
now = datetime.utcnow().isoformat()
with self._connect() as conn:
conn.execute(
"""
INSERT INTO extractions (id, schema_name, schema_json, created_at, updated_at, status)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
extraction_id,
schema.name,
json.dumps(schema.to_dict(), ensure_ascii=False),
now,
now,
"in_progress",
),
)
logger.info(
"Extraction creee : %s (schema=%s)", extraction_id[:8], schema.name
)
return extraction_id
def finish_extraction(self, extraction_id: str, status: str = "completed") -> None:
"""Marquer une extraction comme terminee."""
now = datetime.utcnow().isoformat()
with self._connect() as conn:
conn.execute(
"UPDATE extractions SET status = ?, updated_at = ? WHERE id = ?",
(status, now, extraction_id),
)
def get_extraction(self, extraction_id: str) -> Optional[Dict[str, Any]]:
"""Recuperer les metadonnees d'une extraction."""
with self._connect() as conn:
row = conn.execute(
"SELECT * FROM extractions WHERE id = ?", (extraction_id,)
).fetchone()
if row:
return dict(row)
return None
def list_extractions(self, limit: int = 50) -> List[Dict[str, Any]]:
"""Lister les extractions recentes."""
with self._connect() as conn:
rows = conn.execute(
"SELECT * FROM extractions ORDER BY created_at DESC LIMIT ?",
(limit,),
).fetchall()
return [dict(r) for r in rows]
# ------------------------------------------------------------------
# Records (enregistrements)
# ------------------------------------------------------------------
def add_record(
self,
extraction_id: str,
data: Dict[str, Any],
screenshot_path: Optional[str] = None,
confidence: float = 0.0,
errors: Optional[List[str]] = None,
) -> str:
"""
Ajouter un enregistrement extrait.
Args:
extraction_id: ID de la session d'extraction
data: Donnees extraites (dict)
screenshot_path: Chemin du screenshot source
confidence: Score de confiance [0, 1]
errors: Liste d'erreurs de validation
Returns:
record_id (UUID)
"""
record_id = str(uuid.uuid4())
now = datetime.utcnow().isoformat()
with self._connect() as conn:
conn.execute(
"""
INSERT INTO records (id, extraction_id, data_json, screenshot_path,
confidence, errors_json, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
record_id,
extraction_id,
json.dumps(data, ensure_ascii=False),
screenshot_path,
confidence,
json.dumps(errors or [], ensure_ascii=False),
now,
),
)
# Mettre a jour le compteur
conn.execute(
"""
UPDATE extractions
SET record_count = record_count + 1, updated_at = ?
WHERE id = ?
""",
(now, extraction_id),
)
logger.debug(
"Record ajoute : %s (extraction=%s, confiance=%.2f)",
record_id[:8],
extraction_id[:8],
confidence,
)
return record_id
def get_records(self, extraction_id: str) -> List[Dict[str, Any]]:
"""
Recuperer tous les enregistrements d'une extraction.
Returns:
Liste de dicts avec les cles : id, data, screenshot_path,
confidence, errors, created_at
"""
with self._connect() as conn:
rows = conn.execute(
"""
SELECT id, data_json, screenshot_path, confidence,
errors_json, created_at
FROM records
WHERE extraction_id = ?
ORDER BY created_at ASC
""",
(extraction_id,),
).fetchall()
results = []
for row in rows:
results.append({
"id": row["id"],
"data": json.loads(row["data_json"]),
"screenshot_path": row["screenshot_path"],
"confidence": row["confidence"],
"errors": json.loads(row["errors_json"]) if row["errors_json"] else [],
"created_at": row["created_at"],
})
return results
# ------------------------------------------------------------------
# Export
# ------------------------------------------------------------------
def export_csv(self, extraction_id: str, output_path: str) -> str:
"""
Exporter les enregistrements en CSV.
Args:
extraction_id: ID de la session
output_path: Chemin du fichier CSV de sortie
Returns:
Chemin du fichier cree
"""
records = self.get_records(extraction_id)
if not records:
raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
# Determiner les colonnes depuis le premier record
all_keys = self._collect_all_keys(records)
with open(out, "w", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
writer.writeheader()
for rec in records:
writer.writerow(rec["data"])
logger.info("Export CSV : %s (%d lignes)", output_path, len(records))
return str(out)
def export_excel(self, extraction_id: str, output_path: str) -> str:
"""
Exporter les enregistrements en Excel (openpyxl).
Args:
extraction_id: ID de la session
output_path: Chemin du fichier Excel de sortie
Returns:
Chemin du fichier cree
Raises:
ImportError: Si openpyxl n'est pas installe
"""
try:
import openpyxl
except ImportError:
raise ImportError(
"openpyxl est requis pour l'export Excel. "
"Installez-le : pip install openpyxl"
)
records = self.get_records(extraction_id)
if not records:
raise ValueError(f"Aucun enregistrement pour l'extraction {extraction_id}")
out = Path(output_path)
out.parent.mkdir(parents=True, exist_ok=True)
all_keys = self._collect_all_keys(records)
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Extraction"
# En-tetes
for col_idx, key in enumerate(all_keys, start=1):
cell = ws.cell(row=1, column=col_idx, value=key)
cell.font = openpyxl.styles.Font(bold=True)
# Donnees
for row_idx, rec in enumerate(records, start=2):
for col_idx, key in enumerate(all_keys, start=1):
ws.cell(row=row_idx, column=col_idx, value=rec["data"].get(key, ""))
# Ajuster la largeur des colonnes
for col_idx, key in enumerate(all_keys, start=1):
max_len = max(
len(str(key)),
*(len(str(rec["data"].get(key, ""))) for rec in records),
)
ws.column_dimensions[openpyxl.utils.get_column_letter(col_idx)].width = min(max_len + 2, 50)
wb.save(str(out))
logger.info("Export Excel : %s (%d lignes)", output_path, len(records))
return str(out)
# ------------------------------------------------------------------
# Statistiques
# ------------------------------------------------------------------
def get_stats(self, extraction_id: str) -> Dict[str, Any]:
"""
Statistiques d'une extraction.
Returns:
Dict avec : record_count, avg_confidence, completeness,
field_coverage, status, duration
"""
extraction = self.get_extraction(extraction_id)
if not extraction:
return {"error": f"Extraction {extraction_id} introuvable"}
records = self.get_records(extraction_id)
if not records:
return {
"extraction_id": extraction_id,
"schema_name": extraction["schema_name"],
"status": extraction["status"],
"record_count": 0,
"avg_confidence": 0.0,
"completeness": 0.0,
"field_coverage": {},
}
# Confiance moyenne
confidences = [r["confidence"] for r in records]
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
# Couverture par champ : pourcentage de records ayant une valeur non-nulle
schema_data = json.loads(extraction["schema_json"])
field_names = [f["name"] for f in schema_data.get("fields", [])]
field_coverage = {}
for fname in field_names:
filled = sum(
1 for r in records
if r["data"].get(fname) is not None
and str(r["data"][fname]).strip() != ""
)
field_coverage[fname] = filled / len(records) if records else 0.0
# Completude globale
completeness = (
sum(field_coverage.values()) / len(field_coverage)
if field_coverage else 0.0
)
# Erreurs
total_errors = sum(len(r.get("errors", [])) for r in records)
return {
"extraction_id": extraction_id,
"schema_name": extraction["schema_name"],
"status": extraction["status"],
"record_count": len(records),
"avg_confidence": round(avg_confidence, 3),
"completeness": round(completeness, 3),
"field_coverage": {k: round(v, 3) for k, v in field_coverage.items()},
"total_errors": total_errors,
"created_at": extraction["created_at"],
"updated_at": extraction["updated_at"],
}
# ------------------------------------------------------------------
# Nettoyage
# ------------------------------------------------------------------
def delete_extraction(self, extraction_id: str) -> bool:
"""Supprimer une extraction et tous ses records."""
with self._connect() as conn:
conn.execute("DELETE FROM records WHERE extraction_id = ?", (extraction_id,))
result = conn.execute("DELETE FROM extractions WHERE id = ?", (extraction_id,))
return result.rowcount > 0
# ------------------------------------------------------------------
# Utilitaires internes
# ------------------------------------------------------------------
@staticmethod
def _collect_all_keys(records: List[Dict[str, Any]]) -> List[str]:
"""Collecter toutes les cles uniques des records, en preservant l'ordre."""
seen = set()
keys = []
for rec in records:
for k in rec["data"].keys():
if k not in seen:
seen.add(k)
keys.append(k)
return keys
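
The store can also be driven directly, without the engine — useful in tests or when importing
records extracted elsewhere (a sketch; names and values are illustrative):

    from core.extraction import DataStore, ExtractionSchema

    store = DataStore("data/extractions/store.db")
    schema = ExtractionSchema.from_dict({
        "name": "demo",
        "description": "Minimal demo schema",
        "fields": [{"name": "nom", "description": "Displayed name", "type": "text"}],
    })

    extraction_id = store.create_extraction(schema)
    store.add_record(extraction_id, {"nom": "DUPONT"}, confidence=0.9)
    store.finish_extraction(extraction_id)
    print(store.get_stats(extraction_id)["avg_confidence"])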


@@ -0,0 +1,312 @@
"""
ExtractionEngine - Orchestrateur principal du moteur d'extraction de donnees
Orchestre le cycle complet :
naviguer -> screenshot -> extraire -> valider -> stocker -> suivant
S'appuie sur FieldExtractor (VLM/OCR), DataStore (SQLite), et
IterationController (navigation) pour realiser l'extraction automatisee
de donnees depuis des interfaces utilisateur.
"""
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
import requests
from .data_store import DataStore
from .field_extractor import FieldExtractor
from .iteration_controller import IterationController
from .schema import ExtractionSchema
logger = logging.getLogger(__name__)
class ExtractionEngine:
"""
Moteur d'extraction principal.
Orchestre le cycle : naviguer -> screenshot -> extraire -> stocker -> suivant.
Modes d'utilisation :
1. Automatique : start_extraction() — boucle complete avec navigation
2. Manuel : extract_current_screen() — extraction ponctuelle d'un screenshot
"""
def __init__(
self,
schema: ExtractionSchema,
store: Optional[DataStore] = None,
field_extractor: Optional[FieldExtractor] = None,
streaming_server_url: str = "http://localhost:5005",
screenshot_dir: str = "data/extractions/screenshots",
):
"""
Args:
schema: Schema d'extraction decrivant les champs et la navigation
store: DataStore pour le stockage (cree un par defaut si absent)
field_extractor: Extracteur de champs (cree un par defaut si absent)
streaming_server_url: URL du streaming server Agent V1
screenshot_dir: Repertoire pour sauvegarder les screenshots
"""
self.schema = schema
self.store = store or DataStore()
self.field_extractor = field_extractor or FieldExtractor()
self.controller = IterationController(schema, streaming_server_url)
self.streaming_server_url = streaming_server_url.rstrip("/")
self.screenshot_dir = Path(screenshot_dir)
self.screenshot_dir.mkdir(parents=True, exist_ok=True)
# Etat interne
self._current_extraction_id: Optional[str] = None
self._is_running = False
self._should_stop = False
self._progress_callback: Optional[Callable] = None
# ------------------------------------------------------------------
# API publique - Extraction automatique
# ------------------------------------------------------------------
def start_extraction(
self,
session_id: str,
on_progress: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> str:
"""
Demarrer une session d'extraction automatique.
Boucle :
1. Creer l'extraction dans le store
2. Pour chaque enregistrement :
a. Prendre un screenshot
b. Extraire les champs
c. Valider
d. Stocker
e. Naviguer au suivant
3. Finaliser et retourner l'extraction_id
Args:
session_id: ID de la session de streaming (pour navigation)
on_progress: Callback appele a chaque record (optionnel)
Returns:
extraction_id
"""
self._is_running = True
self._should_stop = False
self._progress_callback = on_progress
# Creer la session d'extraction
extraction_id = self.store.create_extraction(self.schema)
self._current_extraction_id = extraction_id
logger.info(
"Demarrage extraction %s (schema=%s, max=%d)",
extraction_id[:8],
self.schema.name,
self.controller.max_records,
)
try:
while self.controller.has_next() and not self._should_stop:
idx = self.controller.current_index
# 1. Screenshot
screenshot_path = self._take_screenshot(session_id, idx)
if screenshot_path is None:
logger.warning("Screenshot echoue a l'index %d, on continue", idx)
# Naviguer quand meme pour ne pas rester bloque
self.controller.navigate_to_next(session_id)
continue
# 2. Extraction
result = self.extract_current_screen(screenshot_path)
# 3. Stockage
self.store.add_record(
extraction_id=extraction_id,
data=result["data"],
screenshot_path=screenshot_path,
confidence=result["confidence"],
errors=result.get("errors"),
)
# 4. Callback de progression
if self._progress_callback:
progress = self.get_progress()
progress["last_record"] = result["data"]
progress["last_confidence"] = result["confidence"]
self._progress_callback(progress)
logger.info(
"Record %d/%d extrait (confiance=%.2f)",
idx + 1,
self.controller.max_records,
result["confidence"],
)
# 5. Navigation
if not self.controller.navigate_to_next(session_id):
logger.info("Fin de navigation a l'index %d", idx)
break
# Finaliser
status = "stopped" if self._should_stop else "completed"
self.store.finish_extraction(extraction_id, status=status)
logger.info(
"Extraction %s terminee : %s (%d records)",
extraction_id[:8],
status,
self.controller.current_index,
)
except Exception as e:
logger.error("Erreur pendant l'extraction : %s", e)
self.store.finish_extraction(extraction_id, status="error")
raise
finally:
self._is_running = False
self._current_extraction_id = None
return extraction_id
def stop_extraction(self) -> None:
"""Demander l'arret de l'extraction en cours."""
if self._is_running:
logger.info("Arret demande pour l'extraction en cours")
self._should_stop = True
# ------------------------------------------------------------------
# API publique - Extraction ponctuelle
# ------------------------------------------------------------------
def extract_current_screen(self, screenshot_path: str) -> Dict[str, Any]:
"""
Extraire les champs du screenshot actuel sans navigation.
Args:
screenshot_path: Chemin vers le screenshot
Returns:
Dict avec 'data', 'confidence', 'errors', 'validation'
"""
# Extraction
result = self.field_extractor.extract_fields(screenshot_path, self.schema)
# Validation contre le schema
validation = self.schema.validate_record(result["data"])
result["validation"] = validation
return result
# ------------------------------------------------------------------
# API publique - Progression
# ------------------------------------------------------------------
def get_progress(self) -> Dict[str, Any]:
"""Retourne la progression actuelle de l'extraction."""
nav_progress = self.controller.progress
stats = {}
if self._current_extraction_id:
stats = self.store.get_stats(self._current_extraction_id)
return {
"extraction_id": self._current_extraction_id,
"is_running": self._is_running,
"navigation": nav_progress,
"stats": stats,
"schema_name": self.schema.name,
}
# ------------------------------------------------------------------
# Screenshot
# ------------------------------------------------------------------
def _take_screenshot(self, session_id: str, index: int) -> Optional[str]:
"""
Prendre un screenshot via le streaming server.
Essaie d'appeler l'API du streaming server pour obtenir
le screenshot courant. En cas d'echec, retourne None.
Args:
session_id: ID de la session de streaming
index: Index de l'enregistrement courant
Returns:
Chemin du screenshot sauvegarde, ou None
"""
try:
response = requests.get(
f"{self.streaming_server_url}/api/screenshot",
params={"session_id": session_id},
timeout=10,
)
if response.status_code == 200:
# Sauvegarder le screenshot
timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
filename = f"record_{index:04d}_{timestamp}.png"
filepath = self.screenshot_dir / filename
with open(filepath, "wb") as f:
f.write(response.content)
return str(filepath)
else:
logger.warning(
"Screenshot echoue : HTTP %d", response.status_code
)
return None
except requests.exceptions.ConnectionError:
logger.warning(
"Streaming server non accessible pour screenshot"
)
return None
except Exception as e:
logger.error("Erreur screenshot : %s", e)
return None
# ------------------------------------------------------------------
# Utilitaires
# ------------------------------------------------------------------
def extract_from_file(self, screenshot_path: str) -> Dict[str, Any]:
"""
Raccourci pour extraire depuis un fichier existant
et stocker le resultat.
Utile pour du retraitement offline de screenshots.
Args:
screenshot_path: Chemin vers un screenshot existant
Returns:
Dict avec les donnees extraites et le record_id
"""
if self._current_extraction_id is None:
extraction_id = self.store.create_extraction(self.schema)
else:
extraction_id = self._current_extraction_id
result = self.extract_current_screen(screenshot_path)
record_id = self.store.add_record(
extraction_id=extraction_id,
data=result["data"],
screenshot_path=screenshot_path,
confidence=result["confidence"],
errors=result.get("errors"),
)
result["record_id"] = record_id
result["extraction_id"] = extraction_id
return result
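
For offline reprocessing of screenshots already on disk, extract_from_file can be used without
any streaming session (a sketch; paths are illustrative):

    from core.extraction import ExtractionEngine, ExtractionSchema

    engine = ExtractionEngine(ExtractionSchema.from_yaml("schemas/dossier_patient.yaml"))
    result = engine.extract_from_file("data/extractions/screenshots/record_0003.png")

    # 'result' carries 'data', 'confidence', 'errors' and 'validation', plus the
    # 'record_id' / 'extraction_id' under which it was stored.
    print(result["confidence"], result["data"])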


@@ -0,0 +1,327 @@
"""
FieldExtractor - Extraction de champs structures depuis des screenshots
Utilise un VLM (Ollama) pour comprendre le contenu visuel et en extraire
des donnees structurees selon un schema predefini.
Fallback OCR via docTR si le VLM echoue.
"""
import base64
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import requests
from .schema import ExtractionField, ExtractionSchema
logger = logging.getLogger(__name__)
# Configuration Ollama (coherente avec le reste du projet)
OLLAMA_DEFAULT_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
OLLAMA_DEFAULT_MODEL = os.environ.get("VLM_MODEL", "qwen3-vl:8b")
class FieldExtractor:
"""
Extrait des champs structures depuis un screenshot.
Pipeline :
1. VLM : envoyer screenshot + schema au VLM pour extraction structuree
2. Validation : verifier les regex, types, champs requis
3. (Optionnel) OCR fallback si VLM indisponible
"""
def __init__(
self,
ollama_url: str = OLLAMA_DEFAULT_URL,
ollama_model: str = OLLAMA_DEFAULT_MODEL,
timeout: int = 60,
):
"""
Args:
ollama_url: URL du serveur Ollama
ollama_model: Modele VLM a utiliser
timeout: Timeout en secondes pour les appels VLM
"""
self.ollama_url = ollama_url.rstrip("/")
self.ollama_model = ollama_model
self.timeout = timeout
# ------------------------------------------------------------------
# API publique
# ------------------------------------------------------------------
def extract_fields(
self,
screenshot_path: str,
schema: ExtractionSchema,
) -> Dict[str, Any]:
"""
Extraire les champs definis par le schema depuis un screenshot.
Args:
screenshot_path: Chemin vers l'image (PNG/JPEG)
schema: Schema d'extraction
Returns:
Dict avec les champs extraits + metadonnees
{
"data": {"nom": "DUPONT", "prenom": "Jean", ...},
"confidence": 0.85,
"errors": [],
"raw_response": "..."
}
"""
path = Path(screenshot_path)
if not path.exists():
return {
"data": {},
"confidence": 0.0,
"errors": [f"Fichier introuvable : {screenshot_path}"],
"raw_response": None,
}
# Encoder l'image en base64
image_b64 = self._encode_image(path)
# Extraction via VLM
raw_data, raw_response = self._extract_via_vlm(image_b64, schema.fields)
if raw_data is None:
logger.warning("VLM extraction echouee, tentative OCR fallback")
raw_data = self._extract_via_ocr_fallback(path, schema.fields)
raw_response = "(ocr fallback)"
# Validation et nettoyage
validated = {}
errors: List[str] = []
valid_count = 0
for fld in schema.fields:
value = raw_data.get(fld.name) if raw_data else None
# Nettoyer
if value is not None:
value = str(value).strip()
if value == "" or value.lower() in ("null", "none", "n/a"):
value = None
validated[fld.name] = value
if not fld.validate_value(value):
errors.append(
f"Champ '{fld.name}' invalide ou manquant : {value!r}"
)
else:
if value is not None and str(value).strip():
valid_count += 1
total = len(schema.fields) if schema.fields else 1
confidence = valid_count / total
return {
"data": validated,
"confidence": confidence,
"errors": errors,
"raw_response": raw_response,
}
# ------------------------------------------------------------------
# Extraction VLM
# ------------------------------------------------------------------
def _extract_via_vlm(
self, image_b64: str, fields: List[ExtractionField]
) -> tuple:
"""
Appeler le VLM (Ollama) pour extraction structuree.
Returns:
(dict_donnees | None, raw_response_text | None)
"""
prompt = self._build_extraction_prompt(fields)
try:
# Desactiver le mode thinking pour Qwen3
effective_prompt = prompt
if "qwen" in self.ollama_model.lower():
effective_prompt = f"/nothink {prompt}"
payload = {
"model": self.ollama_model,
"prompt": effective_prompt,
"images": [image_b64],
"stream": False,
"format": "json",
"options": {
"temperature": 0.1,
"num_predict": 2000,
},
}
response = requests.post(
f"{self.ollama_url}/api/generate",
json=payload,
timeout=self.timeout,
)
if response.status_code != 200:
logger.error(
"Erreur Ollama %d : %s",
response.status_code,
response.text[:300],
)
return None, None
result = response.json()
raw_text = result.get("response", "").strip()
logger.debug("Reponse VLM brute : %s", raw_text[:500])
parsed = self._parse_vlm_response(raw_text)
return parsed, raw_text
except requests.exceptions.Timeout:
logger.error("Timeout VLM apres %ds", self.timeout)
return None, None
except requests.exceptions.ConnectionError:
logger.error("Ollama non accessible a %s", self.ollama_url)
return None, None
except Exception as e:
logger.error("Erreur VLM inattendue : %s", e)
return None, None
def _build_extraction_prompt(self, fields: List[ExtractionField]) -> str:
"""Construire le prompt d'extraction structure pour le VLM."""
field_descriptions = []
for f in fields:
desc = f"- {f.name} ({f.field_type}): {f.description}"
if f.required:
desc += " [OBLIGATOIRE]"
if f.validation_regex:
desc += f" (format: {f.validation_regex})"
field_descriptions.append(desc)
fields_text = "\n".join(field_descriptions)
return f"""Regarde cette capture d'ecran et extrais les informations suivantes.
CHAMPS A EXTRAIRE :
{fields_text}
INSTRUCTIONS :
1. Extrais chaque champ tel qu'il apparait a l'ecran
2. Si un champ n'est pas visible, mets null
3. Pour les dates, conserve le format tel qu'affiche
4. Pour les nombres, conserve le format avec virgule si present
5. Reponds UNIQUEMENT en JSON valide
FORMAT DE REPONSE :
Un objet JSON avec les cles correspondant aux noms de champs ci-dessus.
Exemple : {{"nom": "DUPONT", "prenom": "Jean", "date_naissance": "15/03/1965"}}
Extrais maintenant les donnees :"""
def _parse_vlm_response(self, text: str) -> Optional[Dict[str, Any]]:
"""Parser la reponse JSON du VLM."""
if not text:
return None
# Essayer le parse direct
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Chercher un objet JSON dans la reponse
match = re.search(r"\{[\s\S]*\}", text)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
# Chercher entre balises ```json ... ```
match = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", text)
if match:
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
pass
logger.warning("Impossible de parser la reponse VLM en JSON")
return None
# ------------------------------------------------------------------
# OCR Fallback
# ------------------------------------------------------------------
def _extract_via_ocr_fallback(
self, image_path: Path, fields: List[ExtractionField]
) -> Optional[Dict[str, Any]]:
"""
Fallback : extraire du texte brut via OCR (docTR) puis tenter
un mapping basique vers les champs.
Ce fallback est tres basique ; il fournit le texte brut
sans mapping intelligent. Le VLM reste la methode privilegiee.
"""
try:
from PIL import Image as PILImage
img = PILImage.open(str(image_path))
# Tenter docTR
try:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
predictor = ocr_predictor(det_arch="db_mobilenet_v3_large", reco_arch="crnn_mobilenet_v3_large", pretrained=True)
doc = DocumentFile.from_images([str(image_path)])
result = predictor(doc)
# Extraire tout le texte
all_text = []
for page in result.pages:
for block in page.blocks:
for line in block.lines:
line_text = " ".join(w.value for w in line.words)
all_text.append(line_text)
full_text = "\n".join(all_text)
logger.info("OCR fallback : %d lignes extraites", len(all_text))
# Retourner le texte complet dans un champ special
return {"_ocr_text": full_text}
except ImportError:
logger.warning("docTR non disponible pour le fallback OCR")
return None
except Exception as e:
logger.error("Erreur OCR fallback : %s", e)
return None
# ------------------------------------------------------------------
# Utilitaires
# ------------------------------------------------------------------
@staticmethod
def _encode_image(path: Path) -> str:
"""Encoder une image en base64."""
with open(path, "rb") as f:
return base64.b64encode(f.read()).decode("utf-8")
def check_vlm_available(self) -> bool:
"""Verifier si le VLM Ollama est accessible."""
try:
response = requests.get(
f"{self.ollama_url}/api/tags", timeout=5
)
return response.status_code == 200
except (requests.RequestException, ConnectionError, TimeoutError):
return False
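
FieldExtractor can also be exercised on its own against a single image, which helps when tuning
the prompt or the model (a sketch; the model tag is simply whatever the local Ollama serves):

    from core.extraction import ExtractionSchema, FieldExtractor

    extractor = FieldExtractor(ollama_model="qwen2.5vl:7b", timeout=60)
    schema = ExtractionSchema.from_yaml("schemas/dossier_patient.yaml")

    if extractor.check_vlm_available():
        result = extractor.extract_fields("samples/fiche_patient.png", schema)
        print(result["data"], result["confidence"], result["errors"])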


@@ -0,0 +1,258 @@
"""
IterationController - Controle de navigation entre enregistrements
Gere la boucle de navigation : passage au record suivant, pagination,
scroll, etc. Communique avec le streaming server (Agent V1) pour
envoyer les actions de navigation sur la machine cible.
"""
import logging
import time
from typing import Any, Dict, Optional
import requests
from .schema import ExtractionSchema
logger = logging.getLogger(__name__)
class IterationController:
"""
Controle la navigation entre les enregistrements a extraire.
Types de navigation supportes :
- list_detail : cliquer sur chaque element d'une liste
- pagination : bouton suivant / page suivante
- scroll : defilement vertical
- manual : l'utilisateur navigue manuellement
"""
def __init__(
self,
schema: ExtractionSchema,
streaming_server_url: str = "http://localhost:5005",
):
"""
Args:
schema: Schema d'extraction (contient les regles de navigation)
streaming_server_url: URL du streaming server Agent V1
"""
self.schema = schema
self.server_url = streaming_server_url.rstrip("/")
self.current_index = 0
self.max_records = schema.navigation.get("max_records", 100)
self.nav_type = schema.navigation.get("type", "manual")
self.nav_action = schema.navigation.get("next_record", "click_next_in_list")
self.nav_delay = schema.navigation.get("delay_ms", 1000)
# Etat interne
self._started = False
self._finished = False
# ------------------------------------------------------------------
# API publique
# ------------------------------------------------------------------
def has_next(self) -> bool:
"""Retourne True s'il reste des enregistrements a traiter."""
if self._finished:
return False
return self.current_index < self.max_records
def navigate_to_next(self, session_id: str) -> bool:
"""
Naviguer vers l'enregistrement suivant.
Envoie les actions de navigation au streaming server
en fonction du type de navigation defini dans le schema.
Args:
session_id: ID de la session de streaming
Returns:
True si la navigation a reussi
"""
if not self.has_next():
logger.info("Plus d'enregistrements a traiter (index=%d)", self.current_index)
return False
success = False
if self.nav_type == "manual":
# Mode manuel : on attend juste un delai
logger.info(
"Navigation manuelle : attente de %dms (index=%d)",
self.nav_delay,
self.current_index,
)
time.sleep(self.nav_delay / 1000)
success = True
elif self.nav_type == "pagination":
success = self._navigate_pagination(session_id)
elif self.nav_type == "list_detail":
success = self._navigate_list_detail(session_id)
elif self.nav_type == "scroll":
success = self._navigate_scroll(session_id)
else:
logger.warning("Type de navigation inconnu : %s", self.nav_type)
success = False
if success:
self.current_index += 1
logger.debug(
"Navigation reussie -> index=%d/%d",
self.current_index,
self.max_records,
)
return success
def navigate_to_record(self, session_id: str, index: int) -> bool:
"""
Naviguer vers un enregistrement specifique.
Args:
session_id: ID de la session de streaming
index: Index de l'enregistrement cible
Returns:
True si la navigation a reussi
"""
if index < 0 or index >= self.max_records:
logger.error("Index hors limites : %d (max=%d)", index, self.max_records)
return False
# Naviguer pas a pas jusqu'a l'index cible
steps = index - self.current_index
if steps < 0:
logger.warning(
"Navigation arriere non supportee (current=%d, target=%d)",
self.current_index,
index,
)
return False
for _ in range(steps):
if not self.navigate_to_next(session_id):
return False
return True
def reset(self) -> None:
"""Reinitialiser le controleur."""
self.current_index = 0
self._started = False
self._finished = False
def mark_finished(self) -> None:
"""Marquer l'iteration comme terminee (ex: fin de liste detectee)."""
self._finished = True
logger.info("Iteration marquee comme terminee a l'index %d", self.current_index)
@property
def progress(self) -> Dict[str, Any]:
"""Retourne la progression actuelle."""
return {
"current_index": self.current_index,
"max_records": self.max_records,
"progress_pct": round(
(self.current_index / self.max_records * 100)
if self.max_records > 0 else 0,
1,
),
"nav_type": self.nav_type,
"finished": self._finished,
}
# ------------------------------------------------------------------
# Navigation specifique
# ------------------------------------------------------------------
def _navigate_pagination(self, session_id: str) -> bool:
"""Navigation par pagination (bouton suivant)."""
action = {
"type": "click",
"target": self.nav_action,
"description": "Cliquer sur le bouton suivant / page suivante",
}
return self._send_action(session_id, action)
def _navigate_list_detail(self, session_id: str) -> bool:
"""Navigation dans une liste (cliquer sur l'element suivant)."""
action = {
"type": "click",
"target": self.nav_action,
"index": self.current_index,
"description": f"Cliquer sur l'element {self.current_index + 1} de la liste",
}
return self._send_action(session_id, action)
def _navigate_scroll(self, session_id: str) -> bool:
"""Navigation par defilement."""
action = {
"type": "scroll",
"direction": "down",
"amount": self.schema.navigation.get("scroll_amount", 300),
"description": "Defiler vers le bas",
}
return self._send_action(session_id, action)
# ------------------------------------------------------------------
# Communication avec le streaming server
# ------------------------------------------------------------------
def _send_action(self, session_id: str, action: Dict[str, Any]) -> bool:
"""
Envoyer une action de navigation au streaming server.
L'action est envoyee via l'API du streaming server (port 5005).
Si le serveur n'est pas disponible, on simule un delai.
Args:
session_id: ID de la session de streaming
action: Description de l'action a executer
Returns:
True si l'action a ete executee ou simulee
"""
try:
payload = {
"session_id": session_id,
"action": action,
}
response = requests.post(
f"{self.server_url}/api/action",
json=payload,
timeout=10,
)
if response.status_code == 200:
# Attendre le delai de navigation
if self.nav_delay > 0:
time.sleep(self.nav_delay / 1000)
return True
else:
logger.warning(
"Action de navigation echouee : HTTP %d", response.status_code
)
return False
except requests.exceptions.ConnectionError:
logger.warning(
"Streaming server non accessible a %s — simulation du delai",
self.server_url,
)
# Simuler l'attente de navigation (mode degrade)
if self.nav_delay > 0:
time.sleep(self.nav_delay / 1000)
return True
except Exception as e:
logger.error("Erreur envoi action de navigation : %s", e)
return False
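
Used on its own, the controller simply advances an index while sending navigation actions to
the streaming server — a sketch, with an illustrative session id:

    from core.extraction import ExtractionSchema, IterationController

    controller = IterationController(ExtractionSchema.from_yaml("schemas/dossier_patient.yaml"))
    while controller.has_next():
        # ... extract the currently displayed record here ...
        if not controller.navigate_to_next("stream-01"):
            break
    print(controller.progress)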

core/extraction/schema.py — new file (217 lines)

@@ -0,0 +1,217 @@
"""
Schema d'extraction de donnees - Definition des champs et navigation
Permet de definir un schema YAML decrivant les champs a extraire
depuis des captures d'ecran (DPI, formulaires, listes...).
"""
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml
@dataclass
class ExtractionField:
"""Definition d'un champ a extraire depuis un screenshot."""
name: str # Ex: "nom_patient", "date_naissance"
description: str # Description pour le VLM
field_type: str = "text" # "text", "date", "number", "boolean"
required: bool = True
validation_regex: Optional[str] = None # Regex de validation optionnelle
def validate_value(self, value: Optional[str]) -> bool:
"""
Valider une valeur extraite pour ce champ.
Returns:
True si la valeur est valide
"""
# Champ requis mais absent
if self.required and (value is None or str(value).strip() == ""):
return False
# Pas de valeur et pas requis => OK
if value is None or str(value).strip() == "":
return True
value_str = str(value).strip()
# Validation par type
if self.field_type == "number":
try:
float(value_str.replace(",", ".").replace(" ", ""))
except ValueError:
return False
elif self.field_type == "boolean":
if value_str.lower() not in (
"true", "false", "oui", "non", "1", "0", "vrai", "faux"
):
return False
elif self.field_type == "date":
# Accepter les formats courants FR
date_patterns = [
r"\d{2}/\d{2}/\d{4}", # JJ/MM/AAAA
r"\d{2}-\d{2}-\d{4}", # JJ-MM-AAAA
r"\d{4}-\d{2}-\d{2}", # AAAA-MM-JJ (ISO)
r"\d{2}\.\d{2}\.\d{4}", # JJ.MM.AAAA
]
if not any(re.fullmatch(p, value_str) for p in date_patterns):
return False
# Validation regex custom
if self.validation_regex:
if not re.fullmatch(self.validation_regex, value_str):
return False
return True
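# Esquisse d'utilisation (valeurs fictives) de la validation ci-dessus :
champ_date = ExtractionField(
    name="date_naissance",
    description="Date de naissance du patient",
    field_type="date",
)
assert champ_date.validate_value("12/04/1987") is True   # format JJ/MM/AAAA accepté
assert champ_date.validate_value("1987/04/12") is False  # format non reconnu
assert champ_date.validate_value("") is False            # champ requis mais vide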
@dataclass
class ExtractionSchema:
"""
Schéma complet d'extraction : liste de champs + règles de navigation.
Peut être chargé/sauvegardé en YAML pour réutilisation.
"""
name: str # Ex: "dossier_patient_DPI"
description: str
fields: List[ExtractionField] = field(default_factory=list)
navigation: Dict[str, Any] = field(default_factory=dict)
# --- Serialisation YAML ---
@classmethod
def from_yaml(cls, path: str) -> "ExtractionSchema":
"""
Charger un schema depuis un fichier YAML.
Args:
path: Chemin vers le fichier YAML
Returns:
Instance ExtractionSchema
"""
yaml_path = Path(path)
if not yaml_path.exists():
raise FileNotFoundError(f"Schema YAML non trouve : {path}")
with open(yaml_path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f)
if not isinstance(data, dict):
raise ValueError(f"Le fichier YAML doit contenir un dictionnaire, pas {type(data).__name__}")
return cls._from_dict(data)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema":
"""Construire un schema depuis un dictionnaire Python."""
return cls._from_dict(data)
@classmethod
def _from_dict(cls, data: Dict[str, Any]) -> "ExtractionSchema":
"""Construction interne depuis un dict."""
fields_raw = data.get("fields", [])
fields = []
for fd in fields_raw:
fields.append(ExtractionField(
name=fd["name"],
description=fd.get("description", ""),
field_type=fd.get("type", fd.get("field_type", "text")),
required=fd.get("required", True),
validation_regex=fd.get("validation", fd.get("validation_regex")),
))
return cls(
name=data.get("name", "unnamed"),
description=data.get("description", ""),
fields=fields,
navigation=data.get("navigation", {}),
)
def to_yaml(self, path: str) -> None:
"""
Sauvegarder le schema en fichier YAML.
Args:
path: Chemin de sortie
"""
yaml_path = Path(path)
yaml_path.parent.mkdir(parents=True, exist_ok=True)
data = self.to_dict()
with open(yaml_path, "w", encoding="utf-8") as f:
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire serialisable."""
return {
"name": self.name,
"description": self.description,
"fields": [
{
"name": f.name,
"description": f.description,
"type": f.field_type,
"required": f.required,
**({"validation": f.validation_regex} if f.validation_regex else {}),
}
for f in self.fields
],
"navigation": self.navigation,
}
# --- Utilitaires ---
@property
def required_fields(self) -> List[ExtractionField]:
"""Retourne la liste des champs obligatoires."""
return [f for f in self.fields if f.required]
@property
def field_names(self) -> List[str]:
"""Retourne la liste des noms de champs."""
return [f.name for f in self.fields]
def get_field(self, name: str) -> Optional[ExtractionField]:
"""Recuperer un champ par son nom."""
for f in self.fields:
if f.name == name:
return f
return None
def validate_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
"""
Valider un enregistrement complet contre le schema.
Returns:
Dict avec 'valid' (bool), 'errors' (list), 'completeness' (float)
"""
errors = []
valid_count = 0
for fld in self.fields:
value = record.get(fld.name)
if fld.validate_value(value):
if value is not None and str(value).strip():
valid_count += 1
else:
errors.append(f"Champ '{fld.name}' invalide: {value!r}")
total = len(self.fields) if self.fields else 1
completeness = valid_count / total
return {
"valid": len(errors) == 0,
"errors": errors,
"completeness": completeness,
}
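Exemple indicatif (schéma fictif) montrant le format de dictionnaire accepté par from_dict / from_yaml et le résultat de validate_record :
schema = ExtractionSchema.from_dict({
    "name": "dossier_patient_demo",
    "description": "Champs de démonstration",
    "fields": [
        {"name": "nom_patient", "description": "Nom complet", "type": "text"},
        {"name": "date_naissance", "description": "Date de naissance", "type": "date"},
        {"name": "nb_sejours", "description": "Nombre de séjours", "type": "number", "required": False},
    ],
    "navigation": {"mode": "scroll", "scroll_amount": 300},  # clé "mode" purement illustrative
})

resultat = schema.validate_record({
    "nom_patient": "DUPONT Jean",
    "date_naissance": "01/01/1980",
})
# nb_sejours absent mais non requis : enregistrement valide, completeness = 2/3
print(resultat)   # {'valid': True, 'errors': [], 'completeness': 0.666...}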

View File

@@ -24,8 +24,9 @@ Example:
"""
import logging
from typing import List, Dict, Optional, Tuple
from collections import defaultdict
import os
from typing import List, Dict, Optional, Tuple, Any
from collections import defaultdict, Counter
from datetime import datetime
from pathlib import Path
@@ -106,6 +107,7 @@ class GraphBuilder:
self.clustering_eps = clustering_eps
self.clustering_min_samples = clustering_min_samples
self.enable_quality_validation = enable_quality_validation
self._screen_analyzer = None # ScreenAnalyzer (lazy import)
logger.info(
f"GraphBuilder initialized: "
@@ -119,39 +121,47 @@ class GraphBuilder:
self,
session: RawSession,
workflow_name: Optional[str] = None,
precomputed_states: Optional[List["ScreenState"]] = None,
) -> Workflow:
"""
Construire un Workflow complet depuis une RawSession.
Processus:
1. Créer ScreenStates depuis screenshots
1. Créer ScreenStates depuis screenshots (ou utiliser precomputed_states)
2. Calculer embeddings pour chaque état
3. Détecter patterns via clustering
4. Construire nodes depuis clusters
5. Construire edges depuis transitions
Args:
session: Session brute à analyser
workflow_name: Nom du workflow (généré si None)
precomputed_states: ScreenStates déjà analysés (streaming).
Si fourni, saute l'étape 1 (pas de re-analyse via ScreenAnalyzer).
Returns:
Workflow construit avec nodes et edges
Raises:
ValueError: Si la session est vide ou invalide
"""
if not session.screenshots:
raise ValueError("Session has no screenshots")
if not precomputed_states and not session.screenshots:
raise ValueError("Session has no screenshots and no precomputed states")
logger.info(
f"Building workflow from session {session.session_id} "
f"with {len(session.screenshots)} screenshots"
f"with {len(precomputed_states or session.screenshots)} "
f"{'precomputed states' if precomputed_states else 'screenshots'}"
)
# Étape 1: Créer ScreenStates
screen_states = self._create_screen_states(session)
logger.debug(f"Created {len(screen_states)} screen states")
# Étape 1: Créer ScreenStates (ou réutiliser ceux pré-calculés)
if precomputed_states:
screen_states = precomputed_states
logger.debug(f"Using {len(screen_states)} precomputed screen states")
else:
screen_states = self._create_screen_states(session)
logger.debug(f"Created {len(screen_states)} screen states")
# Étape 2: Calculer embeddings
embeddings = self._compute_embeddings(screen_states)
logger.debug(f"Computed {len(embeddings)} embeddings")
@@ -315,16 +325,31 @@ class GraphBuilder:
file_size_bytes=screenshot_path.stat().st_size if screenshot_path.exists() else 0
)
# Créer PerceptionLevel (sera enrichi par embedding_builder)
# Créer PerceptionLevel et l'enrichir avec OCR si le screenshot existe
detected_text = []
text_method = "none"
if screenshot_path.exists():
try:
if self._screen_analyzer is None:
from core.pipeline.screen_analyzer import ScreenAnalyzer
self._screen_analyzer = ScreenAnalyzer(session_id=session.session_id)
extracted = self._screen_analyzer._extract_text(str(screenshot_path))
if extracted:
detected_text = extracted
text_method = self._screen_analyzer._get_ocr_method_name()
except Exception as e:
logger.debug(f"OCR échoué pour {screenshot_path}: {e}")
perception = PerceptionLevel(
embedding=EmbeddingRef(
provider="openclip_ViT-B-32",
vector_id=f"data/embeddings/screens/{session.session_id}_state_{i:04d}.npy",
dimensions=512
),
detected_text=[], # Sera rempli par VLM/OCR
text_detection_method="pending",
confidence_avg=0.0
detected_text=detected_text,
text_detection_method=text_method,
confidence_avg=0.85 if detected_text else 0.0
)
# Créer ContextLevel
@@ -504,8 +529,12 @@ class GraphBuilder:
node = WorkflowNode(
node_id=f"node_{cluster_id:03d}",
name=f"State Pattern {cluster_id}",
screen_template=template,
observation_count=len(indices),
description=f"Pattern auto-détecté ({len(indices)} observations)",
template=template,
metadata={
"observation_count": len(indices),
"_prototype_vector": prototype.tolist(),
},
)
nodes.append(node)
@@ -522,27 +551,172 @@ class GraphBuilder:
) -> ScreenTemplate:
"""
Créer un ScreenTemplate depuis un cluster d'états.
TODO: Implémenter extraction intelligente de:
- window_title_pattern (regex depuis titres communs)
- required_text_patterns (texte présent dans tous les états)
- required_ui_elements (éléments UI communs)
Extrait les contraintes communes à tous les états du cluster :
- window_title_pattern : titre de fenêtre commun
- required_text_patterns : textes présents dans la majorité des états
- required_ui_elements : rôles/types UI récurrents
Args:
states: États du cluster
prototype_embedding: Embedding prototype
Returns:
ScreenTemplate avec contraintes
ScreenTemplate avec contraintes extraites
"""
# Pour l'instant, template basique avec seulement l'embedding
return ScreenTemplate(
embedding_prototype=prototype_embedding.tolist(),
similarity_threshold=0.85,
window_title_pattern=None, # TODO: Extraire
required_text_patterns=[], # TODO: Extraire
required_ui_elements=[], # TODO: Extraire
# --- Extraction du titre de fenêtre commun ---
window_title_pattern = self._extract_window_pattern(states)
# --- Extraction des textes récurrents ---
required_text_patterns = self._extract_common_texts(states)
# --- Extraction des éléments UI récurrents ---
required_ui_elements = self._extract_common_ui_elements(states)
# Construire les sous-objets de contraintes
window_constraint = WindowConstraint(
title_pattern=window_title_pattern,
title_contains=window_title_pattern,
)
text_constraint = TextConstraint(
required_texts=required_text_patterns,
)
ui_roles = [
e.get("role", "") for e in required_ui_elements if e.get("role")
]
ui_constraint = UIConstraint(
required_roles=ui_roles,
)
embedding_proto = EmbeddingPrototype(
provider="openclip_ViT-B-32",
vector_id="", # Le vecteur est stocké dans node.metadata._prototype_vector
min_cosine_similarity=0.85,
sample_count=len(states),
)
return ScreenTemplate(
window=window_constraint,
text=text_constraint,
ui=ui_constraint,
embedding=embedding_proto,
)
def _extract_window_pattern(self, states: List[ScreenState]) -> Optional[str]:
"""Extraire un pattern de titre de fenêtre commun aux états du cluster."""
titles = [s.window.window_title for s in states if s.window.window_title]
if not titles:
return None
# Si tous les titres sont identiques, retourner directement
if len(set(titles)) == 1:
return titles[0]
# Trouver le préfixe commun le plus long
prefix = os.path.commonprefix(titles)
if len(prefix) >= 5:
return prefix.rstrip(" -–—|")
# Fallback : le titre le plus fréquent (Counter est déjà importé en tête de module)
return Counter(titles).most_common(1)[0][0]
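# Illustration (titres fictifs) du repli par préfixe commun :
titres = ["Dossier patient - Martin", "Dossier patient - Dupont"]
prefixe = os.path.commonprefix(titres)      # "Dossier patient - "
print(prefixe.rstrip(" -–—|"))              # "Dossier patient"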
def _extract_common_texts(
self, states: List[ScreenState], min_presence_ratio: float = 0.6
) -> List[str]:
"""
Extraire les textes présents dans la majorité des états du cluster.
Args:
states: États du cluster
min_presence_ratio: Proportion minimale de présence (0.6 = 60% des états)
"""
if not states:
return []
# Collecter les textes de chaque état
text_counts: Dict[str, int] = defaultdict(int)
states_with_text = 0
for state in states:
if hasattr(state.perception, 'detected_text') and state.perception.detected_text:
states_with_text += 1
seen_in_state = set()
for text in state.perception.detected_text:
normalized = text.strip().lower()
if len(normalized) >= 3 and normalized not in seen_in_state:
text_counts[normalized] += 1
seen_in_state.add(normalized)
if states_with_text == 0:
return []
# Garder les textes présents dans au moins min_presence_ratio des états
threshold = max(2, int(states_with_text * min_presence_ratio))
common_texts = [
text for text, count in text_counts.items()
if count >= threshold
]
# Limiter à 10 textes les plus fréquents
common_texts.sort(key=lambda t: text_counts[t], reverse=True)
return common_texts[:10]
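# Exemple chiffré (valeurs fictives) : avec 5 états porteurs de texte et min_presence_ratio=0.6,
# threshold = max(2, int(5 * 0.6)) = 3 : un texte doit apparaître dans au moins 3 états pour être retenu.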
def _extract_common_ui_elements(
self, states: List[ScreenState], min_presence_ratio: float = 0.5
) -> List[Dict[str, Any]]:
"""
Extraire les types/rôles d'éléments UI récurrents dans le cluster.
Retourne une liste de contraintes UI au format:
[{"type": "button", "role": "validate", "min_count": 1}, ...]
"""
if not states:
return []
# Compter les paires (type, role) dans chaque état
role_counts: Dict[str, int] = defaultdict(int)
type_counts: Dict[str, int] = defaultdict(int)
states_with_ui = 0
for state in states:
if state.ui_elements:
states_with_ui += 1
seen_roles = set()
seen_types = set()
for el in state.ui_elements:
el_type = getattr(el, 'type', 'unknown')
el_role = getattr(el, 'role', 'unknown')
if el_role != 'unknown' and el_role not in seen_roles:
role_counts[el_role] += 1
seen_roles.add(el_role)
if el_type != 'unknown' and el_type not in seen_types:
type_counts[el_type] += 1
seen_types.add(el_type)
if states_with_ui == 0:
return []
threshold = max(2, int(states_with_ui * min_presence_ratio))
constraints = []
# Ajouter les rôles récurrents
for role, count in role_counts.items():
if count >= threshold:
constraints.append({
"role": role,
"min_count": 1,
})
# Limiter à 8 contraintes
constraints.sort(key=lambda c: role_counts.get(c.get("role", ""), 0), reverse=True)
return constraints[:8]
def _build_edges(
self,
@@ -633,9 +807,14 @@ class GraphBuilder:
# Récupérer les embeddings des prototypes de nodes
node_prototypes = {}
for node in nodes:
if hasattr(node, 'template') and node.template:
if hasattr(node.template, 'embedding_prototype'):
node_prototypes[node.node_id] = np.array(node.template.embedding_prototype)
# Priorité : vecteur en mémoire (metadata), sinon chargement depuis disque
proto_list = node.metadata.get("_prototype_vector")
if proto_list is not None:
node_prototypes[node.node_id] = np.array(proto_list, dtype=np.float32)
elif node.template and node.template.embedding and node.template.embedding.vector_id:
proto_path = Path(node.template.embedding.vector_id)
if proto_path.exists():
node_prototypes[node.node_id] = np.load(proto_path)
if not node_prototypes:
logger.warning("No node prototypes available for mapping")
@@ -741,7 +920,7 @@ class GraphBuilder:
action = Action(
type=action_type,
target=TargetSpec(
role=target_role,
by_role=target_role,
selection_policy="first",
fallback_strategy="visual_similarity"
),

View File

@@ -133,10 +133,10 @@ class NodeMatcher:
node: WorkflowNode
) -> bool:
"""Valider les contraintes du node contre l'état."""
template = node.screen_template
if template.window_title_pattern:
if not state.raw_level or not state.raw_level.window_title:
template = node.template
if template and template.window and template.window.title_pattern:
if not state.window or not state.window.window_title:
return False
return True
@@ -179,13 +179,14 @@ class NodeMatcher:
# Calculer similarités avec tous les nodes
similarities = []
for node in candidate_nodes:
if node.screen_template.embedding_prototype_path:
proto_path = node.template.embedding.vector_id if (node.template and node.template.embedding) else None
if proto_path:
try:
prototype = np.load(node.screen_template.embedding_prototype_path)
prototype = np.load(proto_path)
similarity = float(np.dot(state_vector, prototype))
similarities.append({
'node_id': node.node_id,
'node_label': node.label,
'node_label': node.name,
'similarity': similarity,
'threshold': self.similarity_threshold,
'matched': similarity >= self.similarity_threshold
@@ -204,9 +205,9 @@ class NodeMatcher:
'timestamp': timestamp,
'failed_match_id': failed_match_id,
'state': {
'window_title': state.raw_level.window_title if state.raw_level else None,
'screenshot_path': str(state.raw_level.screenshot_path) if state.raw_level else None,
'ui_elements_count': len(state.perception_level.ui_elements) if state.perception_level else 0
'window_title': state.window.window_title if getattr(state, 'window', None) else None,
'screenshot_path': str(state.raw.screenshot_path) if getattr(state, 'raw', None) else None,
'ui_elements_count': len(state.ui_elements) if getattr(state, 'ui_elements', None) else 0
},
'matching_results': {
'best_confidence': best_confidence,

View File

@@ -303,7 +303,7 @@ class HierarchicalMatcher:
if not window_info:
return 0.5 # Score neutre si pas d'info
template = getattr(node, 'screen_template', None)
template = getattr(node, 'template', None)
if not template:
return 0.5
@@ -311,7 +311,7 @@ class HierarchicalMatcher:
# Matching du titre
current_title = window_info.get('title', '')
template_pattern = getattr(template, 'window_title_pattern', None)
template_pattern = getattr(template.window, 'title_pattern', None) if getattr(template, 'window', None) else None
if template_pattern and current_title:
if self.config.use_regex_title_matching:
@@ -329,7 +329,7 @@ class HierarchicalMatcher:
# Matching du processus
current_process = window_info.get('process_name', '')
template_process = getattr(template, 'process_name', None)
template_process = getattr(template.window, 'process_name', None) if getattr(template, 'window', None) else None
if template_process and current_process:
if current_process.lower() == template_process.lower():
@@ -367,12 +367,12 @@ class HierarchicalMatcher:
Returns:
Score de confiance 0.0-1.0
"""
template = getattr(node, 'screen_template', None)
template = getattr(node, 'template', None)
if not template:
return 0.5
# Récupérer embedding prototype du template
prototype = getattr(template, 'embedding_prototype', None)
prototype = getattr(template.embedding, 'vector_id', None) if getattr(template, 'embedding', None) else None
if prototype is None:
return 0.5
@@ -445,7 +445,7 @@ class HierarchicalMatcher:
if not detected_elements:
return 0.5
template = getattr(node, 'screen_template', None)
template = getattr(node, 'template', None)
if not template:
return 0.5

View File

@@ -92,6 +92,41 @@ def get_execution_result():
from .execution_result import WorkflowExecutionResult
return WorkflowExecutionResult
# Lazy import via __getattr__ pour éviter les imports circulaires
_LAZY_IMPORTS = {
"StateEmbedding": "core.models.state_embedding",
"EmbeddingComponent": "core.models.state_embedding",
"Workflow": "core.models.workflow_graph",
"WorkflowNode": "core.models.workflow_graph",
"WorkflowEdge": "core.models.workflow_graph",
"ScreenTemplate": "core.models.workflow_graph",
"Action": "core.models.workflow_graph",
"TargetSpec": "core.models.workflow_graph",
"ActionType": "core.models.workflow_graph",
"EdgeConstraints": "core.models.workflow_graph",
"PostConditions": "core.models.workflow_graph",
"LearningState": "core.models.workflow_graph",
"SelectionPolicy": "core.models.workflow_graph",
"WindowConstraint": "core.models.workflow_graph",
"TextConstraint": "core.models.workflow_graph",
"UIConstraint": "core.models.workflow_graph",
"EmbeddingPrototype": "core.models.workflow_graph",
"EdgeStats": "core.models.workflow_graph",
"SafetyRules": "core.models.workflow_graph",
"WorkflowStats": "core.models.workflow_graph",
"LearningConfig": "core.models.workflow_graph",
"WorkflowExecutionResult": "core.models.execution_result",
"PerformanceMetrics": "core.models.execution_result",
}
def __getattr__(name):
if name in _LAZY_IMPORTS:
import importlib
module = importlib.import_module(_LAZY_IMPORTS[name])
return getattr(module, name)
raise AttributeError(f"module 'core.models' has no attribute {name!r}")
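# Esquisse d'utilisation du chargement paresseux (PEP 562) ci-dessus :
# le sous-module n'est importé qu'au premier accès à l'attribut.
import core.models

Workflow = core.models.Workflow  # déclenche l'import de core.models.workflow_graph
print(Workflow.__module__)       # "core.models.workflow_graph"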
__all__ = [
# Modèles de base standardisés (Tâche 4)
"BBox",

View File

@@ -45,6 +45,25 @@ class BBox(BaseModel):
return int(v)
raise ValueError("Dimensions must be numeric")
def __iter__(self):
"""Permet le unpacking: x, y, w, h = bbox"""
return iter((self.x, self.y, self.width, self.height))
def __getitem__(self, index):
"""Permet l'accès par index: bbox[0], bbox[1], etc."""
return (self.x, self.y, self.width, self.height)[index]
def __len__(self):
return 4
def __eq__(self, other):
if isinstance(other, BBox):
return (self.x == other.x and self.y == other.y and
self.width == other.width and self.height == other.height)
if isinstance(other, (tuple, list)) and len(other) == 4:
return (self.x, self.y, self.width, self.height) == tuple(other)
return NotImplemented
def to_tuple(self) -> Tuple[int, int, int, int]:
"""Conversion vers tuple (x, y, w, h)"""
return (self.x, self.y, self.width, self.height)
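# Esquisse : BBox se manipule désormais comme un tuple (x, y, w, h) ;
# la construction par mots-clés suit les champs Pydantic du modèle.
bbox = BBox(x=10, y=20, width=100, height=50)
x, y, w, h = bbox                  # unpacking via __iter__
assert bbox[2] == 100              # accès par index via __getitem__
assert bbox == (10, 20, 100, 50)   # comparaison avec un tuple via __eq__
assert bbox.to_tuple() == (10, 20, 100, 50)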

View File

@@ -311,8 +311,8 @@ class ScreenTemplate:
# Vérifier contraintes de texte
if hasattr(screen_state, 'perception'):
detected_texts = getattr(screen_state.perception, 'detected_texts', [])
if not self.text.matches(detected_texts):
detected_text = getattr(screen_state.perception, 'detected_text', [])
if not self.text.matches(detected_text):
return False, 0.0
# Vérifier contraintes UI

View File

@@ -3,5 +3,6 @@ Pipeline module - Orchestration du flux RPA Vision V3
"""
from .workflow_pipeline import WorkflowPipeline, create_pipeline
from .screen_analyzer import ScreenAnalyzer
__all__ = ["WorkflowPipeline", "create_pipeline"]
__all__ = ["WorkflowPipeline", "create_pipeline", "ScreenAnalyzer"]

View File

@@ -0,0 +1,343 @@
"""
ScreenAnalyzer - Construction complète d'un ScreenState depuis un screenshot
Orchestre les 4 niveaux du ScreenState :
Niveau 1 (Raw) : métadonnées de l'image
Niveau 2 (Perception): OCR + embedding global
Niveau 3 (UI) : détection d'éléments UI
Niveau 4 (Contexte) : fenêtre active, workflow en cours
Ce module comble le chaînon manquant entre la capture brute (Couche 0)
et la construction d'embeddings (Couche 3).
"""
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, List
from PIL import Image
from core.models.screen_state import (
ScreenState,
RawLevel,
PerceptionLevel,
ContextLevel,
WindowContext,
EmbeddingRef,
)
from core.models.ui_element import UIElement
logger = logging.getLogger(__name__)
class ScreenAnalyzer:
"""
Construit un ScreenState complet (4 niveaux) depuis un screenshot.
Utilise le UIDetector pour la détection d'éléments et un OCR
(docTR ou Tesseract) pour l'extraction de texte.
Example:
>>> analyzer = ScreenAnalyzer()
>>> state = analyzer.analyze("/path/to/screenshot.png")
>>> print(state.perception.detected_text)
>>> print(len(state.ui_elements))
"""
def __init__(
self,
ui_detector=None,
ocr_engine: Optional[str] = None,
session_id: str = "",
):
"""
Args:
ui_detector: Instance de UIDetector (créé si None)
ocr_engine: Moteur OCR à utiliser ("doctr", "tesseract", None=auto)
session_id: ID de la session en cours
"""
self._ui_detector = ui_detector
self._ocr_engine_name = ocr_engine
self._ocr = None
self.session_id = session_id
self._state_counter = 0
# Initialisation lazy pour éviter les imports lourds au démarrage
self._ui_detector_initialized = ui_detector is not None
self._ocr_initialized = False
# =========================================================================
# API publique
# =========================================================================
def analyze(
self,
screenshot_path: str,
window_info: Optional[Dict[str, Any]] = None,
context: Optional[Dict[str, Any]] = None,
) -> ScreenState:
"""
Analyser un screenshot et construire un ScreenState complet.
Args:
screenshot_path: Chemin vers le fichier image
window_info: Infos fenêtre active {"title": ..., "app_name": ...}
context: Contexte métier optionnel
Returns:
ScreenState avec les 4 niveaux remplis
"""
screenshot_path = str(screenshot_path)
self._state_counter += 1
state_id = f"{self.session_id}_state_{self._state_counter:04d}" if self.session_id else f"state_{self._state_counter:04d}"
# Niveau 1 : Raw
raw = self._build_raw_level(screenshot_path)
# Niveau 2 : Perception (OCR)
detected_text = self._extract_text(screenshot_path)
perception = PerceptionLevel(
embedding=EmbeddingRef(
provider="openclip_ViT-B-32",
vector_id=f"data/embeddings/screens/{state_id}.npy",
dimensions=512,
),
detected_text=detected_text,
text_detection_method=self._get_ocr_method_name(),
confidence_avg=0.85 if detected_text else 0.0,
)
# Niveau 3 : UI Elements
ui_elements = self._detect_ui_elements(screenshot_path, window_info)
# Niveau 4 : Contexte
window_ctx = self._build_window_context(window_info)
context_level = self._build_context_level(context)
state = ScreenState(
screen_state_id=state_id,
timestamp=datetime.now(),
session_id=self.session_id,
window=window_ctx,
raw=raw,
perception=perception,
context=context_level,
metadata={
"analyzer_version": "1.0",
"ui_elements_count": len(ui_elements),
"text_regions_count": len(detected_text),
},
ui_elements=ui_elements,
)
logger.info(
f"ScreenState {state_id} construit: "
f"{len(ui_elements)} éléments UI, {len(detected_text)} textes détectés"
)
return state
def analyze_image(
self,
image: Image.Image,
save_dir: str = "data/screens",
window_info: Optional[Dict[str, Any]] = None,
context: Optional[Dict[str, Any]] = None,
) -> ScreenState:
"""
Analyser une PIL Image (utile quand on a déjà l'image en mémoire).
Sauvegarde l'image sur disque puis appelle analyze().
"""
save_path = Path(save_dir)
save_path.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
filename = f"screen_{timestamp}.png"
filepath = save_path / filename
image.save(str(filepath))
return self.analyze(str(filepath), window_info=window_info, context=context)
# =========================================================================
# Niveau 1 : Raw
# =========================================================================
def _build_raw_level(self, screenshot_path: str) -> RawLevel:
file_size = 0
try:
file_size = os.path.getsize(screenshot_path)
except OSError:
pass
return RawLevel(
screenshot_path=screenshot_path,
capture_method="mss",
file_size_bytes=file_size,
)
# =========================================================================
# Niveau 2 : Perception — OCR
# =========================================================================
def _extract_text(self, screenshot_path: str) -> List[str]:
"""Extraire le texte d'un screenshot via OCR."""
self._ensure_ocr()
if self._ocr is None:
return []
try:
return self._ocr(screenshot_path)
except Exception as e:
logger.warning(f"OCR échoué: {e}")
return []
def _ensure_ocr(self) -> None:
"""Initialiser le moteur OCR (lazy)."""
if self._ocr_initialized:
return
self._ocr_initialized = True
engine = self._ocr_engine_name
# Auto-détection : essayer docTR puis Tesseract
if engine is None or engine == "doctr":
try:
self._ocr = self._create_doctr_ocr()
self._ocr_engine_name = "doctr"  # mémoriser le moteur réellement chargé
logger.info("OCR initialisé avec docTR")
return
except Exception as e:
if engine == "doctr":
logger.warning(f"docTR non disponible: {e}")
return
if engine is None or engine == "tesseract":
try:
self._ocr = self._create_tesseract_ocr()
self._ocr_engine_name = "tesseract"
logger.info("OCR initialisé avec Tesseract")
return
except Exception as e:
logger.warning(f"Tesseract non disponible: {e}")
logger.warning("Aucun moteur OCR disponible — detected_text sera vide")
def _create_doctr_ocr(self):
"""Créer une fonction OCR basée sur docTR."""
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
predictor = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
def ocr_func(image_path: str) -> List[str]:
doc = DocumentFile.from_images(image_path)
result = predictor(doc)
texts = []
for page in result.pages:
for block in page.blocks:
for line in block.lines:
line_text = " ".join(word.value for word in line.words)
if line_text.strip():
texts.append(line_text.strip())
return texts
return ocr_func
def _create_tesseract_ocr(self):
"""Créer une fonction OCR basée sur Tesseract."""
import pytesseract
def ocr_func(image_path: str) -> List[str]:
img = Image.open(image_path)
raw_text = pytesseract.image_to_string(img, lang="fra+eng")
lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
return lines
return ocr_func
def _get_ocr_method_name(self) -> str:
if self._ocr is None:
return "none"
# _ensure_ocr renseigne _ocr_engine_name avec le moteur réellement chargé
return self._ocr_engine_name or "unknown"
# =========================================================================
# Niveau 3 : UI Elements
# =========================================================================
def _detect_ui_elements(
self,
screenshot_path: str,
window_info: Optional[Dict[str, Any]] = None,
) -> List[UIElement]:
"""Détecter les éléments UI dans le screenshot."""
self._ensure_ui_detector()
if self._ui_detector is None:
return []
try:
elements = self._ui_detector.detect(
screenshot_path, window_context=window_info
)
return elements
except Exception as e:
logger.warning(f"Détection UI échouée: {e}")
return []
def _ensure_ui_detector(self) -> None:
"""Initialiser le UIDetector (lazy)."""
if self._ui_detector_initialized:
return
self._ui_detector_initialized = True
try:
from core.detection.ui_detector import UIDetector, DetectionConfig
config = DetectionConfig(
use_owl_detection=False, # Désactiver OWL par défaut (lourd)
use_vlm_classification=True,
confidence_threshold=0.6,
)
self._ui_detector = UIDetector(config)
logger.info("UIDetector initialisé")
except Exception as e:
logger.warning(f"UIDetector non disponible: {e}")
self._ui_detector = None
# =========================================================================
# Niveau 4 : Contexte
# =========================================================================
def _build_window_context(
self, window_info: Optional[Dict[str, Any]] = None
) -> WindowContext:
if window_info:
return WindowContext(
app_name=window_info.get("app_name", "unknown"),
window_title=window_info.get("title", "Unknown"),
screen_resolution=window_info.get("screen_resolution", [1920, 1080]),
workspace=window_info.get("workspace", "main"),
)
return WindowContext(
app_name="unknown",
window_title="Unknown",
screen_resolution=[1920, 1080],
workspace="main",
)
def _build_context_level(
self, context: Optional[Dict[str, Any]] = None
) -> ContextLevel:
if context:
return ContextLevel(
current_workflow_candidate=context.get("workflow_candidate"),
workflow_step=context.get("workflow_step"),
user_id=context.get("user_id", ""),
tags=context.get("tags", []),
business_variables=context.get("business_variables", {}),
)
return ContextLevel()
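Exemple d'utilisation indicatif (chemins et infos fenêtre fictifs) :
# Esquisse : analyse d'une image déjà en mémoire, avec moteur OCR forcé.
from PIL import Image

analyzer = ScreenAnalyzer(ocr_engine="tesseract", session_id="demo_001")
img = Image.open("captures/ecran_accueil.png")          # chemin fictif
state = analyzer.analyze_image(
    img,
    save_dir="data/screens",
    window_info={"title": "DPI - Accueil", "app_name": "dpi_client"},
)
print(state.perception.text_detection_method)           # "tesseract" si pytesseract est installé
print(len(state.ui_elements), "éléments UI détectés")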

View File

@@ -319,17 +319,25 @@ class WorkflowPipeline:
np.ndarray ou None si aucun vecteur trouvé
"""
# v1: prototype stocké en liste directement
# v3: prototype stocké dans metadata (Phase 0, mars 2026)
meta = getattr(node, "metadata", {}) or {}
proto_list = meta.get("_prototype_vector")
if proto_list is not None and isinstance(proto_list, list):
try:
return np.array(proto_list, dtype=np.float32)
except Exception as e:
logger.debug(f"Failed to convert metadata prototype: {e}")
# v1: prototype stocké en liste directement sur template
tpl = getattr(node, "template", None)
if tpl is not None:
proto_list = getattr(tpl, "embedding_prototype", None)
if isinstance(proto_list, list):
try:
v = np.array(proto_list, dtype=np.float32)
return v
return np.array(proto_list, dtype=np.float32)
except Exception as e:
logger.debug(f"Failed to convert embedding_prototype list: {e}")
# v2: prototype stocké sur disque via EmbeddingPrototype.vector_id
if tpl is not None:
emb = getattr(tpl, "embedding", None)
@@ -341,16 +349,6 @@ class WorkflowPipeline:
except Exception as e:
logger.debug(f"Failed to load vector from {vector_id}: {e}")
# fallback (ancienne nomenclature)
st = getattr(node, "screen_template", None)
if st is not None:
p = getattr(st, "embedding_prototype_path", None)
if p:
try:
return np.load(p).astype(np.float32)
except Exception as e:
logger.debug(f"Failed to load legacy vector from {p}: {e}")
return None
# =========================================================================
@@ -918,18 +916,6 @@ class WorkflowPipeline:
"recovery_attempted": recovery_result.success,
"recovery_message": recovery_result.message if recovery_result else None
}
self.error_handler.error_history.append(error_ctx)
self.error_handler._log_error(error_ctx)
return {
"execution_id": execution_id,
"workflow_id": workflow_id,
"success": False,
"step_type": "execution_error",
"error": str(e),
"execution_time_ms": total_time_ms,
"correlation_id": execution_id
}
# =============================================================================

View File

@@ -210,7 +210,7 @@ class TrainingQualityValidator:
# 3. Vérifier observations par node
nodes = getattr(workflow, 'nodes', [])
for node in nodes:
obs_count = getattr(node, 'observation_count', 0)
obs_count = (node.metadata.get('observation_count', 0) if getattr(node, 'metadata', None) else 0)
if obs_count < self.config.min_observations_per_node:
recommendations.append(
f"Node '{getattr(node, 'node_id', 'unknown')}' a seulement {obs_count} observations "
@@ -240,7 +240,7 @@ class TrainingQualityValidator:
len(outlier_indices) <= len(embeddings) * self.config.max_outlier_ratio and
(validation_result is None or validation_result.is_valid) and
all(
getattr(node, 'observation_count', 0) >= self.config.min_observations_per_node
(node.metadata.get('observation_count', 0) if getattr(node, 'metadata', None) else 0) >= self.config.min_observations_per_node
for node in nodes
)
)