feat(cognition): timing + écran attendu + auto-apprentissage Shadow + VLM qwen2.5vl
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Mémoire de travail enrichie : - Timing par étape (durée, moyenne, alerte si lent) - Écran attendu vs observation réelle - Contexte VLM étendu VLM reasoning : default qwen2.5vl:3b (gemma4 ne supporte pas vision) Auto-apprentissage Shadow : - stream_processor apprend les dialogues automatiquement - Clic utilisateur après dialogue → pattern mémorisé - Sauvegardé dans data/learned_patterns.json GUI-R1 : 10 patterns additionnels extraits du dataset Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -77,10 +77,15 @@ class CognitiveContext:
|
||||
needs_help: bool = False
|
||||
help_reason: str = ""
|
||||
|
||||
# Métadonnées
|
||||
# Timing
|
||||
session_id: str = ""
|
||||
machine_id: str = ""
|
||||
started_at: Optional[datetime] = None
|
||||
step_started_at: Optional[datetime] = None
|
||||
step_durations: Dict[str, List[float]] = field(default_factory=dict)
|
||||
|
||||
# Ce que Léa devrait voir à l'écran (comparaison attendu vs réel)
|
||||
expected_screen: str = ""
|
||||
|
||||
def record_action(self, action_type: str, target: str = "",
|
||||
result: str = "", success: bool = True,
|
||||
@@ -117,10 +122,47 @@ class CognitiveContext:
|
||||
|
||||
def advance_step(self):
    """Move on to the next step of the plan.

    Before advancing, the duration of the step that just finished is
    appended to ``step_durations`` under the step's description (or a
    synthetic ``step_<n>`` key when no description is set).
    """
    # Close out timing for the step we are leaving, if one was running.
    if self.step_started_at:
        elapsed = (datetime.now() - self.step_started_at).total_seconds()
        key = self.current_step_description or f"step_{self.current_step}"
        self.step_durations.setdefault(key, []).append(elapsed)

    # Start the clock on the new step.
    self.current_step += 1
    self.step_started_at = datetime.now()

    # Pull the next description off the plan, when any remains.
    if self.remaining_steps:
        self.current_step_description = self.remaining_steps.pop(0)
|
||||
|
||||
def get_step_timing(self) -> Optional[Dict[str, Any]]:
    """Return timing info for the step currently in progress.

    Returns:
        None when no step is running (``step_started_at`` unset).
        Otherwise a dict with:
          - "elapsed_seconds": seconds since the step started;
          - "avg_previous": mean of past durations for this step key,
            present only when history exists;
          - "is_slow": True when elapsed exceeds twice the average,
            present only when history exists.
    """
    if not self.step_started_at:
        return None

    elapsed = (datetime.now() - self.step_started_at).total_seconds()
    step_key = self.current_step_description or f"step_{self.current_step}"
    history = self.step_durations.get(step_key, [])
    avg = sum(history) / len(history) if history else None

    result = {"elapsed_seconds": elapsed}
    # BUG FIX: the original tested `if avg:`, so an average of exactly
    # 0.0 (all previous runs instantaneous) silently dropped both
    # "avg_previous" and "is_slow". Test for None instead.
    if avg is not None:
        result["avg_previous"] = avg
        result["is_slow"] = elapsed > avg * 2
    return result
|
||||
|
||||
def set_expected_screen(self, description: str):
    """Record what the screen is expected to show for the current step.

    The stored text is later compared word-by-word against the live
    observation by ``check_screen_matches_expected``.
    """
    self.expected_screen = description
|
||||
|
||||
def check_screen_matches_expected(self) -> Optional[bool]:
    """Compare the live observation against the expected screen text.

    Returns:
        None when there is nothing to compare (no expectation set or no
        observation available). Otherwise True when strictly more than
        30% of the expected words appear as substrings of the combined
        window title + OCR text, False otherwise.
    """
    if not self.expected_screen or not self.current_observation:
        return None

    # Flatten title and OCR text into one lowercase haystack.
    haystack = " ".join([
        self.current_observation.window_title,
        self.current_observation.ocr_text,
    ]).lower()

    words = self.expected_screen.lower().split()
    hit_count = sum(w in haystack for w in words)
    return hit_count / max(len(words), 1) > 0.3
|
||||
|
||||
def learn(self, fact: str):
|
||||
"""Enregistre un fait appris pendant l'exécution."""
|
||||
if fact not in self.learned_facts:
|
||||
@@ -175,6 +217,21 @@ class CognitiveContext:
|
||||
for step in self.remaining_steps[:3]:
|
||||
lines.append(f" - {step}")
|
||||
|
||||
timing = self.get_step_timing()
|
||||
if timing:
|
||||
lines.append(f"TEMPS ÉTAPE : {timing['elapsed_seconds']:.1f}s")
|
||||
if timing.get('avg_previous'):
|
||||
lines.append(f"MOYENNE PRÉCÉDENTE : {timing['avg_previous']:.1f}s")
|
||||
if timing.get('is_slow'):
|
||||
lines.append("⚠ ÉTAPE ANORMALEMENT LENTE")
|
||||
|
||||
if self.expected_screen:
|
||||
match = self.check_screen_matches_expected()
|
||||
if match is False:
|
||||
lines.append(f"⚠ ÉCRAN INATTENDU (attendu: {self.expected_screen})")
|
||||
elif match is True:
|
||||
lines.append(f"ÉCRAN CONFORME : {self.expected_screen}")
|
||||
|
||||
lines.append(f"CONFIANCE : {self.confidence:.0%}")
|
||||
|
||||
if self.needs_help:
|
||||
|
||||
@@ -287,7 +287,7 @@ Si l'écran est normal sans action nécessaire, réponds action="nothing".
|
||||
Réponds UNIQUEMENT le JSON, pas d'explication."""
|
||||
|
||||
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
model = os.environ.get("RPA_VLM_MODEL", os.environ.get("VLM_MODEL", "gemma4:e4b"))
|
||||
model = os.environ.get("RPA_REASONING_MODEL", os.environ.get("RPA_VLM_MODEL", "qwen2.5vl:3b"))
|
||||
|
||||
response = requests.post(
|
||||
f"{ollama_url}/api/generate",
|
||||
|
||||
Reference in New Issue
Block a user