feat(knowledge): détection et gestion automatique des dialogues UI
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 11s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 7s
tests / Lint (ruff + black) (push) Successful in 12s
tests / Tests unitaires (sans GPU) (push) Failing after 13s
tests / Tests sécurité (critique) (push) Has been skipped

UIPatternLibrary câblée dans l'executor et le stream processor.
Pendant un wait ou un wait_for_anchor (modes intelligent et debug uniquement), Léa surveille l'écran toutes les secondes :
1. OCR plein écran (docTR)
2. Pattern matching (dialogues Save, OK, Cancel, cookies...)
3. OCR ciblé pour trouver le bouton par son texte réel
4. Clic sur le match le plus bas (bouton, pas titre)

Fix : le seuil de ratio est supprimé (un trigger trouvé vaut match, quelle
que soit la longueur du texte OCR). Le matching des boutons est strict :
mot exact d'au moins 3 caractères (évite les faux positifs sur lettres
isolées). En l'absence de match exact, fallback sur une recherche partielle
sans la première lettre, pour les lettres soulignées (E_nregistrer).

Plus aucune coordonnée hardcodée — 100% vision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-20 11:06:17 +02:00
parent d168833609
commit ffd97ae9a5
2 changed files with 131 additions and 27 deletions

View File

@@ -50,6 +50,7 @@ BUILTIN_PATTERNS: List[Dict[str, Any]] = [
"triggers": [
"voulez-vous enregistrer", "do you want to save",
"save changes", "enregistrer les modifications",
"enregistrer sous", "save as",
"sauvegarder", "unsaved changes",
],
"action": "click",
@@ -328,7 +329,7 @@ class UIPatternLibrary:
score = trigger_score
matched_trigger = trigger
if score > best_score and score > 0.05:
if score > best_score and matched_trigger is not None:
best_score = score
best_match = {
"pattern": pattern.name,

View File

@@ -195,6 +195,10 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
import numpy as np
lib = UIPatternLibrary()
# Debug: vérifier les triggers du dialog_save
save_patterns = [p for p in lib._patterns if p.name == 'dialog_save']
if save_patterns:
print(f" 🔎 [Pattern] dialog_save triggers: {save_patterns[0].triggers}")
with mss.mss() as sct:
monitor = sct.monitors[1]
@@ -208,42 +212,107 @@ def _check_screen_for_patterns() -> Optional[Dict[str, Any]]:
return None
if not ocr_text or len(ocr_text) < 5:
print(f" 🔎 [Pattern] OCR vide ou trop court ({len(ocr_text) if ocr_text else 0} chars)")
return None
pattern = lib.find_pattern(ocr_text)
if pattern and pattern['category'] in ('dialog', 'popup'):
print(f"🧠 [Pattern] Détecté: {pattern['pattern']}{pattern['action']} '{pattern['target']}'")
print(f" Texte OCR: {ocr_text[:100]}...")
return pattern
print(f" 🔎 [Pattern] OCR ({len(ocr_text)} chars): {ocr_text[:500]}")
return None
pattern = lib.find_pattern(ocr_text)
if pattern:
print(f" 🔎 [Pattern] Match: {pattern['pattern']} (category={pattern['category']})")
if pattern['category'] in ('dialog', 'popup'):
print(f"🧠 [Pattern] DÉTECTÉ: {pattern['pattern']}{pattern['action']} '{pattern['target']}'")
return pattern
else:
print(f" 🔎 [Pattern] Ignoré (catégorie {pattern['category']})")
return None
else:
print(f" 🔎 [Pattern] Aucun match dans le texte OCR")
return None
except Exception as e:
logger.debug(f"Pattern check échoué: {e}")
import traceback
print(f" 🔎 [Pattern] EXCEPTION: {e}")
traceback.print_exc()
return None
def _handle_detected_pattern(pattern: Dict[str, Any]) -> bool:
"""Gère automatiquement un pattern UI détecté (clic sur OK, fermer popup, etc.).
"""Gère automatiquement un pattern UI détecté.
Returns:
True si le pattern a été géré avec succès.
Cherche le bouton cible via OCR (position réelle sur l'écran).
Aucun fallback sur des coordonnées : si l'OCR ne trouve pas le bouton, aucun clic n'est effectué et la fonction retourne False.
"""
import pyautogui
action = pattern.get('action')
target = pattern.get('target', '')
bbox = pattern.get('typical_bbox')
alternatives = pattern.get('alternatives', [])
if action == 'click' and bbox:
screen_w, screen_h = pyautogui.size()
x = int((bbox[0] + bbox[2]) / 2 * screen_w)
y = int((bbox[1] + bbox[3]) / 2 * screen_h)
print(f"🤖 [Pattern] Clic automatique sur '{target}' à ({x}, {y})")
pyautogui.click(x, y)
time.sleep(1.0)
return True
if action == 'click':
candidates = [target] + alternatives
# Chercher le bouton via OCR sur l'écran actuel
try:
import mss
from PIL import Image
from services.ocr_service import ocr_extract_words
with mss.mss() as sct:
monitor = sct.monitors[1]
screenshot = sct.grab(monitor)
screen = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')
words = ocr_extract_words(screen)
# Collecter TOUS les matchs, puis prendre le plus bas (boutons = bas du dialogue)
all_matches = []
for candidate in candidates:
candidate_lower = candidate.lower()
for word in words:
word_text = word['text'].lower()
if len(word_text) < 3 or len(candidate_lower) < 3:
continue
if word_text == candidate_lower:
x1, y1, x2, y2 = word['bbox']
all_matches.append({
'text': word['text'],
'x': int((x1 + x2) / 2),
'y': int((y1 + y2) / 2),
'match_type': 'exact',
})
# Recherche partielle (ex: "nregistrer" sans le E souligné)
if not all_matches:
for candidate in candidates:
if len(candidate) > 3:
partial = candidate[1:].lower()
for word in words:
if partial in word['text'].lower():
x1, y1, x2, y2 = word['bbox']
all_matches.append({
'text': word['text'],
'x': int((x1 + x2) / 2),
'y': int((y1 + y2) / 2),
'match_type': 'partial',
})
if all_matches:
for m in all_matches:
print(f" 🔎 [Pattern] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['match_type']}]")
best = max(all_matches, key=lambda m: m['y'])
print(f"🤖 [Pattern] Clic sur '{best['text']}' à ({best['x']}, {best['y']}) [le plus bas = bouton]")
pyautogui.click(best['x'], best['y'])
time.sleep(1.0)
return True
except Exception as e:
print(f" 🔎 [Pattern] OCR bouton échoué: {e}")
print(f" 🔎 [Pattern] Bouton '{target}' introuvable par OCR — pas de clic")
return False
elif action == 'hotkey':
keys = target.split('+')
@@ -309,10 +378,18 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
db.session.commit()
# Vérifier si un dialogue/popup bloque l'écran avant l'étape
if index > 0 and _execution_state.get('execution_mode') in ('intelligent', 'debug'):
detected = _check_screen_for_patterns()
if detected:
_handle_detected_pattern(detected)
if index > 0:
exec_mode = _execution_state.get('execution_mode', 'basic')
print(f" 🔎 [Pattern] Vérification avant étape {index+1} (mode={exec_mode})")
if exec_mode in ('intelligent', 'debug'):
detected = _check_screen_for_patterns()
if detected:
print(f" 🧠 [Pattern] TROUVÉ: {detected.get('pattern')}{detected.get('action')} '{detected.get('target')}'")
_handle_detected_pattern(detected)
else:
print(f" 🔎 [Pattern] Aucun dialogue détecté")
else:
print(f" 🔎 [Pattern] Skip (mode {exec_mode})")
print(f"\n{'='*60}")
print(f"📋 [Execute] Étape {index + 1}/{len(steps)}: {step.action_type}")
@@ -1084,9 +1161,35 @@ def execute_action(action_type: str, params: dict) -> dict:
elif action_type in ['wait_for_anchor', 'wait']:
timeout_ms = params.get('timeout_ms', params.get('timeout', 5000))
print(f"⏳ [Action] Attente {timeout_ms}ms")
time.sleep(timeout_ms / 1000)
return {'success': True, 'output': {'waited_ms': timeout_ms}}
print(f"⏳ [Action] Attente {timeout_ms}ms (avec surveillance patterns)")
elapsed = 0
check_interval = 1000
pattern_handled = None
while elapsed < timeout_ms:
wait_chunk = min(check_interval, timeout_ms - elapsed)
time.sleep(wait_chunk / 1000)
elapsed += wait_chunk
if execution_mode in ('intelligent', 'debug'):
try:
detected = _check_screen_for_patterns()
if detected:
print(f"🧠 [Wait] Dialogue détecté: {detected.get('pattern')}{detected.get('target')}")
_handle_detected_pattern(detected)
pattern_handled = detected
break
except Exception as e:
print(f" 🔎 [Wait] Erreur check: {e}")
return {
'success': True,
'output': {
'waited_ms': elapsed,
'pattern_handled': pattern_handled.get('pattern') if pattern_handled else None
}
}
elif action_type == 'keyboard_shortcut':
keys = params.get('keys', [])