feat: analyse OCR+VLM de l'ancre à la capture (pas à l'exécution)
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Quand l'utilisateur sélectionne une ancre dans le VWB :
1. OCR docTR extrait le texte du crop → target_text
2. Si texte < 3 chars → VLM qwen2.5vl:3b décrit en 5 mots
3. Stocké en BDD (VisualAnchor.target_text + ocr_description)
4. Injecté automatiquement dans les params à l'exécution

L'exécution sait maintenant QUOI chercher dès le départ :
- CLIP vérifie par OCR que le texte correspond
- Le grounding cascade a un vrai target_text
- Plus besoin de deviner à chaque run

Migration SQLite gracieuse (ALTER TABLE si colonnes absentes).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -198,6 +198,49 @@ def select_anchor():
|
||||
thumbnail_path = os.path.join(ANCHORS_DIR, f"{anchor_id}_thumb.png")
|
||||
thumbnail.save(thumbnail_path, 'PNG')
|
||||
|
||||
# ── Analyse automatique du crop : OCR + VLM ────────────────────
|
||||
target_text = ""
|
||||
ocr_description = ""
|
||||
try:
|
||||
from services.ocr_service import ocr_extract_text
|
||||
target_text = ocr_extract_text(thumbnail).strip()
|
||||
print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'")
|
||||
|
||||
# Si le texte OCR est trop court ou vide, décrire via VLM
|
||||
if len(target_text) < 3:
|
||||
try:
|
||||
import requests as http_requests
|
||||
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||
|
||||
# Encoder le crop en base64 pour le VLM
|
||||
thumb_buffer = BytesIO()
|
||||
thumbnail.save(thumb_buffer, format='PNG')
|
||||
thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
|
||||
|
||||
resp = http_requests.post(
|
||||
f"{ollama_url}/api/generate",
|
||||
json={
|
||||
"model": "qwen2.5vl:3b",
|
||||
"prompt": "Describe this UI element in 5 words. Just the name, nothing else.",
|
||||
"images": [thumb_b64],
|
||||
"stream": False,
|
||||
"options": {"temperature": 0.1, "num_predict": 15}
|
||||
},
|
||||
timeout=15
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
ocr_description = resp.json().get("response", "").strip()
|
||||
print(f"🏷️ [VLM] Description ancre: '{ocr_description}'")
|
||||
except Exception as vlm_err:
|
||||
print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
|
||||
else:
|
||||
# Le texte OCR est suffisant, l'utiliser aussi comme description
|
||||
ocr_description = target_text
|
||||
except ImportError:
|
||||
print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
|
||||
except Exception as ocr_err:
|
||||
print(f"⚠️ [OCR] Analyse ancre échouée: {ocr_err}")
|
||||
|
||||
# Créer l'enregistrement en base
|
||||
# Utiliser les dimensions de l'image décodée (pas de session.last_capture qui peut être None)
|
||||
anchor = VisualAnchor(
|
||||
@@ -210,7 +253,9 @@ def select_anchor():
|
||||
bbox_height=h,
|
||||
screen_width=img.width,
|
||||
screen_height=img.height,
|
||||
description=description
|
||||
description=description or ocr_description,
|
||||
target_text=target_text,
|
||||
ocr_description=ocr_description
|
||||
)
|
||||
|
||||
db.session.add(anchor)
|
||||
|
||||
@@ -197,6 +197,12 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
|
||||
}
|
||||
}
|
||||
|
||||
# Injecter le texte OCR et la description VLM pré-calculés
|
||||
if anchor.target_text:
|
||||
params['visual_anchor']['target_text'] = anchor.target_text
|
||||
if anchor.ocr_description:
|
||||
params['visual_anchor']['description'] = anchor.ocr_description
|
||||
|
||||
# Valider le contrat
|
||||
try:
|
||||
enforce_action_contract(step.action_type, params)
|
||||
|
||||
@@ -372,6 +372,23 @@ with app.app_context():
|
||||
db.session.rollback()
|
||||
print(f" [DB] Colonne '{col_name}' déjà existante ou erreur: {e}")
|
||||
|
||||
# Migration manuelle : ajouter les colonnes OCR/VLM aux ancres visuelles
|
||||
if 'visual_anchors' in insp.get_table_names():
|
||||
existing_anchor_cols = {col['name'] for col in insp.get_columns('visual_anchors')}
|
||||
new_anchor_cols = {
|
||||
'target_text': "ALTER TABLE visual_anchors ADD COLUMN target_text TEXT",
|
||||
'ocr_description': "ALTER TABLE visual_anchors ADD COLUMN ocr_description TEXT",
|
||||
}
|
||||
for col_name, sql in new_anchor_cols.items():
|
||||
if col_name not in existing_anchor_cols:
|
||||
try:
|
||||
db.session.execute(text(sql))
|
||||
db.session.commit()
|
||||
print(f" [DB] Colonne '{col_name}' ajoutée à visual_anchors")
|
||||
except Exception as e:
|
||||
db.session.rollback()
|
||||
print(f" [DB] Colonne '{col_name}' déjà existante ou erreur: {e}")
|
||||
|
||||
# Initialize VisualTargetManager with RPA Vision V3 components (optional)
|
||||
try:
|
||||
from core.capture.screen_capturer import ScreenCapturer
|
||||
|
||||
@@ -183,6 +183,11 @@ class VisualAnchor(db.Model):
|
||||
# Description pour l'utilisateur
|
||||
description = db.Column(db.Text, nullable=True)
|
||||
|
||||
# Texte OCR extrait du crop de l'ancre (analyse à la capture)
|
||||
target_text = db.Column(db.Text, nullable=True)
|
||||
# Description VLM de l'ancre (si l'OCR ne trouve pas de texte)
|
||||
ocr_description = db.Column(db.Text, nullable=True)
|
||||
|
||||
# Seuil de confiance pour la détection
|
||||
confidence_threshold = db.Column(db.Float, default=0.8)
|
||||
|
||||
@@ -207,6 +212,8 @@ class VisualAnchor(db.Model):
|
||||
'height': self.screen_height
|
||||
} if self.screen_width else None,
|
||||
'description': self.description,
|
||||
'target_text': self.target_text,
|
||||
'ocr_description': self.ocr_description,
|
||||
'confidence_threshold': self.confidence_threshold,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None
|
||||
}
|
||||
|
||||
@@ -254,6 +254,8 @@ export interface VisualAnchor {
|
||||
bounding_box: { x: number; y: number; width: number; height: number };
|
||||
thumbnail_url?: string;
|
||||
description?: string;
|
||||
target_text?: string;
|
||||
ocr_description?: string;
|
||||
}
|
||||
|
||||
export interface Step {
|
||||
|
||||
Reference in New Issue
Block a user