feat(cognition): raisonnement VLM quand les réflexes ne suffisent pas
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 14s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 14s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
vlm_reason_about_screen() : capture l'écran, envoie au VLM local (gemma4/Ollama) avec l'objectif et le contexte, retourne une action en JSON (click/type/wait/nothing + target + reasoning). Chaîne de décision : 1. Réflexes (UIPatternLibrary) → instantané 2. OCR bouton (docTR) → rapide 3. VLM reasoning (Ollama) → intelligent, ~2-5s Le VLM intervient UNIQUEMENT quand 1 et 2 échouent — pas de latence ajoutée quand les réflexes suffisent. UIPatternLibrary enrichie : charge builtin + GUI-R1 + learned patterns. save_learned_pattern() persiste les patterns appris par Shadow. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -206,7 +206,23 @@ def handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
|
|||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
logger.info(f"Bouton '{target}' introuvable par OCR")
|
logger.info(f"Bouton '{target}' introuvable par OCR — appel VLM...")
|
||||||
|
vlm_result = vlm_reason_about_screen(
|
||||||
|
objective=f"Cliquer sur le bouton '{target}'",
|
||||||
|
context=f"Un dialogue '{pattern.get('pattern')}' est détecté"
|
||||||
|
)
|
||||||
|
if vlm_result and vlm_result.get('action') == 'click' and vlm_result.get('target'):
|
||||||
|
vlm_target = vlm_result['target']
|
||||||
|
for word in words:
|
||||||
|
if vlm_target.lower() in word['text'].lower():
|
||||||
|
x1, y1, x2, y2 = word['bbox']
|
||||||
|
x = int((x1 + x2) / 2)
|
||||||
|
y = int((y1 + y2) / 2)
|
||||||
|
logger.info(f"VLM → clic sur '{word['text']}' à ({x}, {y})")
|
||||||
|
pyautogui.click(x, y)
|
||||||
|
time.sleep(1.0)
|
||||||
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -223,6 +239,90 @@ def handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def vlm_reason_about_screen(objective: str = "", context: str = "") -> Optional[Dict[str, Any]]:
    """Ask the local VLM to reason about the current screen and propose an action.

    Used when reflexes (UIPatternLibrary) and OCR are not enough: the VLM sees
    a screenshot of the primary monitor and decides what to do. Best-effort —
    any failure (capture, HTTP, parsing) returns None instead of raising.

    Args:
        objective: What we are trying to achieve (e.g. "cliquer sur Enregistrer").
        context: Additional context (e.g. "un dialogue est apparu").

    Returns:
        Dict with keys 'action' ("click"/"type"/"wait"/"nothing"), 'target'
        and 'reasoning', or None if the VLM cannot help or an error occurred.
    """
    try:
        # Heavy/optional dependencies are imported lazily so that the module
        # stays importable on machines without mss/requests/PIL installed.
        import base64
        import io
        import json
        import os
        import re

        import mss
        import requests
        from PIL import Image

        # Capture the primary monitor (monitors[0] is the virtual full area).
        with mss.mss() as sct:
            monitor = sct.monitors[1]
            screenshot = sct.grab(monitor)
            screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')

        # JPEG at quality 70 keeps the base64 payload small for Ollama.
        buffer = io.BytesIO()
        screen.save(buffer, format='JPEG', quality=70)
        image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

        prompt = f"""Analyse cet écran et dis-moi quoi faire.

Objectif : {objective or "Interagir avec l'interface visible"}
Contexte : {context or "Aucun contexte supplémentaire"}

Réponds en JSON strict :
{{
"action": "click" ou "type" ou "wait" ou "nothing",
"target": "texte exact du bouton ou champ à cliquer",
"reasoning": "explication courte de ton choix"
}}

Si tu vois un dialogue ou une popup, indique quel bouton cliquer.
Si l'écran est normal sans action nécessaire, réponds action="nothing".
Réponds UNIQUEMENT le JSON, pas d'explication."""

        ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
        # RPA_VLM_MODEL overrides VLM_MODEL, which overrides the default tag.
        model = os.environ.get("RPA_VLM_MODEL", os.environ.get("VLM_MODEL", "gemma4:e4b"))

        response = requests.post(
            f"{ollama_url}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [image_b64],
                "stream": False,
                # Low temperature + short output: we want terse, deterministic JSON.
                "options": {"temperature": 0.1, "num_predict": 200}
            },
            timeout=30
        )

        if response.status_code != 200:
            logger.warning(f"VLM reasoning failed: HTTP {response.status_code}")
            return None

        result = response.json()
        text = result.get('response', '').strip()

        # Extract the first {...} span — models often wrap JSON in prose/fences.
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            try:
                parsed = json.loads(match.group())
            except json.JSONDecodeError:
                # Report malformed JSON as "not parseable" rather than letting it
                # fall through to the generic handler below.
                logger.debug(f"VLM response not parseable: {text[:100]}")
                return None
            logger.info(f"VLM reasoning: {parsed.get('action')} '{parsed.get('target')}' — {parsed.get('reasoning', '')[:80]}")
            return parsed

        logger.debug(f"VLM response not parseable: {text[:100]}")
        return None

    except Exception as e:
        # Deliberate best-effort: VLM reasoning is an optional fallback layer.
        logger.debug(f"VLM reasoning failed: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
def post_execution_cleanup(execution_mode: str = 'debug'):
|
def post_execution_cleanup(execution_mode: str = 'debug'):
|
||||||
"""Vérifie l'écran après exécution et gère les dialogues restants.
|
"""Vérifie l'écran après exécution et gère les dialogues restants.
|
||||||
|
|
||||||
@@ -240,4 +340,10 @@ def post_execution_cleanup(execution_mode: str = 'debug'):
|
|||||||
handle_detected_pattern(detected)
|
handle_detected_pattern(detected)
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
else:
|
else:
|
||||||
|
vlm_result = vlm_reason_about_screen(
|
||||||
|
objective="Vérifier que l'écran est propre après l'exécution",
|
||||||
|
context="Le workflow vient de se terminer"
|
||||||
|
)
|
||||||
|
if vlm_result and vlm_result.get('action') in ('click', 'type'):
|
||||||
|
logger.info(f"VLM post-workflow: {vlm_result.get('action')} '{vlm_result.get('target')}'")
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -251,11 +251,25 @@ class UIPatternLibrary:
|
|||||||
elle sait immédiatement quoi faire.
|
elle sait immédiatement quoi faire.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Chemins par défaut des fichiers de patterns additionnels
|
||||||
|
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||||
|
_GUI_R1_PATTERNS_PATH = _PROJECT_ROOT / "data" / "gui_r1_ui_patterns.json"
|
||||||
|
_LEARNED_PATTERNS_PATH = _PROJECT_ROOT / "data" / "learned_patterns.json"
|
||||||
|
|
||||||
    def __init__(self, extra_patterns_path: Optional[str] = None):
        """Build the pattern library from all known sources.

        Load order: built-in patterns, then static GUI-R1 patterns, then
        dynamically learned (Shadow) patterns, then an optional custom file.
        Later sources append; missing files are tolerated by _load_from_file.

        Args:
            extra_patterns_path: Optional path to an extra JSON pattern file.
        """
        self._patterns: List[UIPattern] = []
        self._load_builtin()

        # Patterns extracted from GUI-R1 (static, generated once)
        self._load_from_file(str(self._GUI_R1_PATTERNS_PATH))

        # Patterns learned via Shadow observation (dynamic)
        self._load_from_file(str(self._LEARNED_PATTERNS_PATH))

        # Custom file explicitly provided by the caller
        if extra_patterns_path:
            self._load_from_file(extra_patterns_path)

        logger.info(f"UIPatternLibrary: {len(self._patterns)} patterns chargés")
|
||||||
|
|
||||||
def _load_builtin(self):
|
def _load_builtin(self):
|
||||||
@@ -278,12 +292,20 @@ class UIPatternLibrary:
|
|||||||
def _load_from_file(self, path: str):
|
def _load_from_file(self, path: str):
|
||||||
filepath = Path(path)
|
filepath = Path(path)
|
||||||
if not filepath.exists():
|
if not filepath.exists():
|
||||||
logger.warning(f"Fichier patterns non trouvé: {path}")
|
logger.debug(f"Fichier patterns non trouvé (OK si premier lancement): {path}")
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
with open(filepath) as f:
|
with open(filepath) as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
for p in data.get("patterns", []):
|
for p in data.get("patterns", []):
|
||||||
|
# Construire metadata en incluant source/learned_at/gui_r1_id si présents
|
||||||
|
meta = dict(p.get("metadata", {}))
|
||||||
|
if "source" in p:
|
||||||
|
meta["source"] = p["source"]
|
||||||
|
if "learned_at" in p:
|
||||||
|
meta["learned_at"] = p["learned_at"]
|
||||||
|
if "gui_r1_id" in p:
|
||||||
|
meta["gui_r1_id"] = p["gui_r1_id"]
|
||||||
self._patterns.append(UIPattern(
|
self._patterns.append(UIPattern(
|
||||||
name=p["name"],
|
name=p["name"],
|
||||||
category=p.get("category", "custom"),
|
category=p.get("category", "custom"),
|
||||||
@@ -293,7 +315,8 @@ class UIPatternLibrary:
|
|||||||
typical_zone=p.get("typical_zone", "content"),
|
typical_zone=p.get("typical_zone", "content"),
|
||||||
typical_bbox=p.get("typical_bbox"),
|
typical_bbox=p.get("typical_bbox"),
|
||||||
os=p.get("os", "any"),
|
os=p.get("os", "any"),
|
||||||
metadata=p.get("metadata", {}),
|
confidence=p.get("confidence", 0.9),
|
||||||
|
metadata=meta,
|
||||||
))
|
))
|
||||||
logger.info(f"Chargé {len(data.get('patterns', []))} patterns depuis {path}")
|
logger.info(f"Chargé {len(data.get('patterns', []))} patterns depuis {path}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -413,6 +436,57 @@ class UIPatternLibrary:
|
|||||||
json.dump(data, f, indent=2, ensure_ascii=False)
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
||||||
logger.info(f"Sauvegardé {len(self._patterns)} patterns dans {path}")
|
logger.info(f"Sauvegardé {len(self._patterns)} patterns dans {path}")
|
||||||
|
|
||||||
|
def save_learned_pattern(self, pattern_dict: Dict[str, Any]):
|
||||||
|
"""Persiste un pattern appris par observation Shadow dans learned_patterns.json.
|
||||||
|
|
||||||
|
Le pattern est ajouté en mémoire ET sauvegardé sur disque.
|
||||||
|
Le fichier est créé s'il n'existe pas, ou les patterns existants sont préservés.
|
||||||
|
"""
|
||||||
|
from datetime import datetime as dt
|
||||||
|
|
||||||
|
# Charger le fichier existant ou créer la structure
|
||||||
|
filepath = self._LEARNED_PATTERNS_PATH
|
||||||
|
filepath.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
existing: Dict[str, Any] = {"patterns": []}
|
||||||
|
if filepath.exists():
|
||||||
|
try:
|
||||||
|
with open(filepath, encoding="utf-8") as f:
|
||||||
|
existing = json.load(f)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
logger.warning(f"Fichier {filepath} corrompu, recréation")
|
||||||
|
|
||||||
|
# Vérifier qu'on ne duplique pas (même trigger + même target)
|
||||||
|
new_triggers = set(t.lower() for t in pattern_dict.get("triggers", []))
|
||||||
|
new_target = pattern_dict.get("target", "").lower()
|
||||||
|
for existing_p in existing.get("patterns", []):
|
||||||
|
existing_triggers = set(t.lower() for t in existing_p.get("triggers", []))
|
||||||
|
if existing_triggers == new_triggers and existing_p.get("target", "").lower() == new_target:
|
||||||
|
logger.debug(f"Pattern déjà connu, skip: triggers={new_triggers}, target={new_target}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Numéroter automatiquement et construire l'entrée complète
|
||||||
|
count = len(existing.get("patterns", []))
|
||||||
|
entry = {
|
||||||
|
"name": pattern_dict.get("name", f"learned_dialog_{count + 1:03d}"),
|
||||||
|
"category": pattern_dict.get("category", "dialog"),
|
||||||
|
"triggers": pattern_dict.get("triggers", []),
|
||||||
|
"action": pattern_dict.get("action", "click"),
|
||||||
|
"target": pattern_dict.get("target", ""),
|
||||||
|
"os": pattern_dict.get("os", "windows"),
|
||||||
|
"source": "shadow_learning",
|
||||||
|
"learned_at": dt.now().isoformat(timespec="seconds"),
|
||||||
|
"confidence": pattern_dict.get("confidence", 0.8),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Ajouter en mémoire (avec le nom auto-généré)
|
||||||
|
self.add_pattern(entry)
|
||||||
|
existing.setdefault("patterns", []).append(entry)
|
||||||
|
|
||||||
|
with open(filepath, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(existing, f, indent=2, ensure_ascii=False)
|
||||||
|
logger.info(f"Pattern appris sauvegardé: {entry['name']} → {entry['target']}")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def stats(self) -> Dict[str, int]:
|
def stats(self) -> Dict[str, int]:
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|||||||
Reference in New Issue
Block a user