feat(knowledge): détection et gestion automatique des dialogues UI
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 11s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 7s
tests / Lint (ruff + black) (push) Successful in 12s
tests / Tests unitaires (sans GPU) (push) Failing after 13s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 11s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 7s
tests / Lint (ruff + black) (push) Successful in 12s
tests / Tests unitaires (sans GPU) (push) Failing after 13s
tests / Tests sécurité (critique) (push) Has been skipped
UIPatternLibrary câblée dans l'executor et le stream processor.

Pendant un wait_for_anchor, Léa surveille l'écran toutes les secondes :
1. OCR plein écran (docTR)
2. Pattern matching (dialogues Save, OK, Cancel, cookies...)
3. OCR ciblé pour trouver le bouton par son texte réel
4. Clic sur le match le plus bas (bouton, pas titre)

Fix : seuil ratio supprimé (trigger trouvé = match, quelle que soit la longueur du texte OCR).
Matching strict mot exact ≥3 chars (évite les faux positifs sur lettres isolées).
Fallback recherche partielle pour les lettres soulignées (E_nregistrer).
Plus aucune coordonnée hardcodée — 100% vision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -50,6 +50,7 @@ BUILTIN_PATTERNS: List[Dict[str, Any]] = [
|
||||
"triggers": [
|
||||
"voulez-vous enregistrer", "do you want to save",
|
||||
"save changes", "enregistrer les modifications",
|
||||
"enregistrer sous", "save as",
|
||||
"sauvegarder", "unsaved changes",
|
||||
],
|
||||
"action": "click",
|
||||
@@ -328,7 +329,7 @@ class UIPatternLibrary:
|
||||
score = trigger_score
|
||||
matched_trigger = trigger
|
||||
|
||||
if score > best_score and score > 0.05:
|
||||
if score > best_score and matched_trigger is not None:
|
||||
best_score = score
|
||||
best_match = {
|
||||
"pattern": pattern.name,
|
||||
|
||||
@@ -195,6 +195,10 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
|
||||
import numpy as np
|
||||
|
||||
lib = UIPatternLibrary()
|
||||
# Debug: vérifier les triggers du dialog_save
|
||||
save_patterns = [p for p in lib._patterns if p.name == 'dialog_save']
|
||||
if save_patterns:
|
||||
print(f" 🔎 [Pattern] dialog_save triggers: {save_patterns[0].triggers}")
|
||||
|
||||
with mss.mss() as sct:
|
||||
monitor = sct.monitors[1]
|
||||
@@ -208,42 +212,107 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
|
||||
return None
|
||||
|
||||
if not ocr_text or len(ocr_text) < 5:
|
||||
print(f" 🔎 [Pattern] OCR vide ou trop court ({len(ocr_text) if ocr_text else 0} chars)")
|
||||
return None
|
||||
|
||||
pattern = lib.find_pattern(ocr_text)
|
||||
if pattern and pattern['category'] in ('dialog', 'popup'):
|
||||
print(f"🧠 [Pattern] Détecté: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'")
|
||||
print(f" Texte OCR: {ocr_text[:100]}...")
|
||||
return pattern
|
||||
print(f" 🔎 [Pattern] OCR ({len(ocr_text)} chars): {ocr_text[:500]}")
|
||||
|
||||
return None
|
||||
pattern = lib.find_pattern(ocr_text)
|
||||
if pattern:
|
||||
print(f" 🔎 [Pattern] Match: {pattern['pattern']} (category={pattern['category']})")
|
||||
if pattern['category'] in ('dialog', 'popup'):
|
||||
print(f"🧠 [Pattern] DÉTECTÉ: {pattern['pattern']} → {pattern['action']} '{pattern['target']}'")
|
||||
return pattern
|
||||
else:
|
||||
print(f" 🔎 [Pattern] Ignoré (catégorie {pattern['category']})")
|
||||
return None
|
||||
else:
|
||||
print(f" 🔎 [Pattern] Aucun match dans le texte OCR")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"Pattern check échoué: {e}")
|
||||
import traceback
|
||||
print(f" 🔎 [Pattern] EXCEPTION: {e}")
|
||||
traceback.print_exc()
|
||||
return None
|
||||
|
||||
|
||||
def _handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
|
||||
"""Gère automatiquement un pattern UI détecté (clic sur OK, fermer popup, etc.).
|
||||
"""Gère automatiquement un pattern UI détecté.
|
||||
|
||||
Returns:
|
||||
True si le pattern a été géré avec succès.
|
||||
Cherche le bouton cible via OCR (position réelle sur l'écran),
|
||||
avec fallback sur les coordonnées typiques si l'OCR ne trouve pas.
|
||||
"""
|
||||
import pyautogui
|
||||
|
||||
action = pattern.get('action')
|
||||
target = pattern.get('target', '')
|
||||
bbox = pattern.get('typical_bbox')
|
||||
alternatives = pattern.get('alternatives', [])
|
||||
|
||||
if action == 'click' and bbox:
|
||||
screen_w, screen_h = pyautogui.size()
|
||||
x = int((bbox[0] + bbox[2]) / 2 * screen_w)
|
||||
y = int((bbox[1] + bbox[3]) / 2 * screen_h)
|
||||
print(f"🤖 [Pattern] Clic automatique sur '{target}' à ({x}, {y})")
|
||||
pyautogui.click(x, y)
|
||||
time.sleep(1.0)
|
||||
return True
|
||||
if action == 'click':
|
||||
candidates = [target] + alternatives
|
||||
|
||||
# Chercher le bouton via OCR sur l'écran actuel
|
||||
try:
|
||||
import mss
|
||||
from PIL import Image
|
||||
from services.ocr_service import ocr_extract_words
|
||||
|
||||
with mss.mss() as sct:
|
||||
monitor = sct.monitors[1]
|
||||
screenshot = sct.grab(monitor)
|
||||
screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
|
||||
|
||||
words = ocr_extract_words(screen)
|
||||
|
||||
# Collecter TOUS les matchs, puis prendre le plus bas (boutons = bas du dialogue)
|
||||
all_matches = []
|
||||
|
||||
for candidate in candidates:
|
||||
candidate_lower = candidate.lower()
|
||||
for word in words:
|
||||
word_text = word['text'].lower()
|
||||
if len(word_text) < 3 or len(candidate_lower) < 3:
|
||||
continue
|
||||
if word_text == candidate_lower:
|
||||
x1, y1, x2, y2 = word['bbox']
|
||||
all_matches.append({
|
||||
'text': word['text'],
|
||||
'x': int((x1 + x2) / 2),
|
||||
'y': int((y1 + y2) / 2),
|
||||
'match_type': 'exact',
|
||||
})
|
||||
|
||||
# Recherche partielle (ex: "nregistrer" sans le E souligné)
|
||||
if not all_matches:
|
||||
for candidate in candidates:
|
||||
if len(candidate) > 3:
|
||||
partial = candidate[1:].lower()
|
||||
for word in words:
|
||||
if partial in word['text'].lower():
|
||||
x1, y1, x2, y2 = word['bbox']
|
||||
all_matches.append({
|
||||
'text': word['text'],
|
||||
'x': int((x1 + x2) / 2),
|
||||
'y': int((y1 + y2) / 2),
|
||||
'match_type': 'partial',
|
||||
})
|
||||
|
||||
if all_matches:
|
||||
for m in all_matches:
|
||||
print(f" 🔎 [Pattern] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['match_type']}]")
|
||||
|
||||
best = max(all_matches, key=lambda m: m['y'])
|
||||
print(f"🤖 [Pattern] Clic sur '{best['text']}' à ({best['x']}, {best['y']}) [le plus bas = bouton]")
|
||||
pyautogui.click(best['x'], best['y'])
|
||||
time.sleep(1.0)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f" 🔎 [Pattern] OCR bouton échoué: {e}")
|
||||
|
||||
print(f" 🔎 [Pattern] Bouton '{target}' introuvable par OCR — pas de clic")
|
||||
return False
|
||||
|
||||
elif action == 'hotkey':
|
||||
keys = target.split('+')
|
||||
@@ -309,10 +378,18 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
|
||||
db.session.commit()
|
||||
|
||||
# Vérifier si un dialogue/popup bloque l'écran avant l'étape
|
||||
if index > 0 and _execution_state.get('execution_mode') in ('intelligent', 'debug'):
|
||||
detected = _check_screen_for_patterns()
|
||||
if detected:
|
||||
_handle_detected_pattern(detected)
|
||||
if index > 0:
|
||||
exec_mode = _execution_state.get('execution_mode', 'basic')
|
||||
print(f" 🔎 [Pattern] Vérification avant étape {index+1} (mode={exec_mode})")
|
||||
if exec_mode in ('intelligent', 'debug'):
|
||||
detected = _check_screen_for_patterns()
|
||||
if detected:
|
||||
print(f" 🧠 [Pattern] TROUVÉ: {detected.get('pattern')} → {detected.get('action')} '{detected.get('target')}'")
|
||||
_handle_detected_pattern(detected)
|
||||
else:
|
||||
print(f" 🔎 [Pattern] Aucun dialogue détecté")
|
||||
else:
|
||||
print(f" 🔎 [Pattern] Skip (mode {exec_mode})")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"📋 [Execute] Étape {index + 1}/{len(steps)}: {step.action_type}")
|
||||
@@ -1084,9 +1161,35 @@ def execute_action(action_type: str, params: dict) -> dict:
|
||||
|
||||
elif action_type in ['wait_for_anchor', 'wait']:
|
||||
timeout_ms = params.get('timeout_ms', params.get('timeout', 5000))
|
||||
print(f"⏳ [Action] Attente {timeout_ms}ms")
|
||||
time.sleep(timeout_ms / 1000)
|
||||
return {'success': True, 'output': {'waited_ms': timeout_ms}}
|
||||
print(f"⏳ [Action] Attente {timeout_ms}ms (avec surveillance patterns)")
|
||||
|
||||
elapsed = 0
|
||||
check_interval = 1000
|
||||
pattern_handled = None
|
||||
|
||||
while elapsed < timeout_ms:
|
||||
wait_chunk = min(check_interval, timeout_ms - elapsed)
|
||||
time.sleep(wait_chunk / 1000)
|
||||
elapsed += wait_chunk
|
||||
|
||||
if execution_mode in ('intelligent', 'debug'):
|
||||
try:
|
||||
detected = _check_screen_for_patterns()
|
||||
if detected:
|
||||
print(f"🧠 [Wait] Dialogue détecté: {detected.get('pattern')} → {detected.get('target')}")
|
||||
_handle_detected_pattern(detected)
|
||||
pattern_handled = detected
|
||||
break
|
||||
except Exception as e:
|
||||
print(f" 🔎 [Wait] Erreur check: {e}")
|
||||
|
||||
return {
|
||||
'success': True,
|
||||
'output': {
|
||||
'waited_ms': elapsed,
|
||||
'pattern_handled': pattern_handled.get('pattern') if pattern_handled else None
|
||||
}
|
||||
}
|
||||
|
||||
elif action_type == 'keyboard_shortcut':
|
||||
keys = params.get('keys', [])
|
||||
|
||||
Reference in New Issue
Block a user