diff --git a/visual_workflow_builder/backend/api_v3/capture.py b/visual_workflow_builder/backend/api_v3/capture.py index 76501987d..0d28b1c6d 100644 --- a/visual_workflow_builder/backend/api_v3/capture.py +++ b/visual_workflow_builder/backend/api_v3/capture.py @@ -198,6 +198,49 @@ def select_anchor(): thumbnail_path = os.path.join(ANCHORS_DIR, f"{anchor_id}_thumb.png") thumbnail.save(thumbnail_path, 'PNG') + # ── Analyse automatique du crop : OCR + VLM ──────────────────── + target_text = "" + ocr_description = "" + try: + from services.ocr_service import ocr_extract_text + target_text = ocr_extract_text(thumbnail).strip() + print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'") + + # Si le texte OCR est trop court ou vide, décrire via VLM + if len(target_text) < 3: + try: + import requests as http_requests + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + + # Encoder le crop en base64 pour le VLM + thumb_buffer = BytesIO() + thumbnail.save(thumb_buffer, format='PNG') + thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8') + + resp = http_requests.post( + f"{ollama_url}/api/generate", + json={ + "model": "qwen2.5vl:3b", + "prompt": "Describe this UI element in 5 words. Just the name, nothing else.", + "images": [thumb_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 15} + }, + timeout=15 + ) + if resp.status_code == 200: + ocr_description = resp.json().get("response", "").strip() + print(f"🏷️ [VLM] Description ancre: '{ocr_description}'") + except Exception as vlm_err: + print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}") + else: + # Le texte OCR est suffisant, l'utiliser aussi comme description + ocr_description = target_text + except ImportError: + print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée") + except Exception as ocr_err: + print(f"⚠️ [OCR] Analyse ancre échouée: {ocr_err}") + # Créer l'enregistrement en base # Utiliser les dimensions de l'image décodée (pas de session.last_capture qui peut être None) anchor = VisualAnchor( @@ -210,7 +253,9 @@ def select_anchor(): bbox_height=h, screen_width=img.width, screen_height=img.height, - description=description + description=description or ocr_description, + target_text=target_text, + ocr_description=ocr_description ) db.session.add(anchor) diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index 4bf5ba028..f430b9313 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -197,6 +197,12 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app): } } + # Injecter le texte OCR et la description VLM pré-calculés + if anchor.target_text: + params['visual_anchor']['target_text'] = anchor.target_text + if anchor.ocr_description: + params['visual_anchor']['description'] = anchor.ocr_description + # Valider le contrat try: enforce_action_contract(step.action_type, params) diff --git a/visual_workflow_builder/backend/app.py b/visual_workflow_builder/backend/app.py index b2744bb86..4f025c764 100644 --- a/visual_workflow_builder/backend/app.py +++ b/visual_workflow_builder/backend/app.py @@ -372,6 +372,23 @@ with app.app_context(): db.session.rollback() print(f" [DB] Colonne '{col_name}' déjà existante ou erreur: {e}") + # Migration manuelle : ajouter les colonnes OCR/VLM aux ancres visuelles + if 'visual_anchors' in insp.get_table_names(): + existing_anchor_cols = {col['name'] for col in insp.get_columns('visual_anchors')} + new_anchor_cols = { + 'target_text': "ALTER TABLE visual_anchors ADD COLUMN target_text TEXT", + 'ocr_description': "ALTER TABLE visual_anchors ADD COLUMN ocr_description TEXT", + } + for col_name, sql in new_anchor_cols.items(): + if col_name not in existing_anchor_cols: + try: + db.session.execute(text(sql)) + db.session.commit() + print(f" [DB] Colonne '{col_name}' ajoutée à visual_anchors") + except Exception as e: + db.session.rollback() + print(f" [DB] Colonne '{col_name}' déjà existante ou erreur: {e}") + # Initialize VisualTargetManager with RPA Vision V3 components (optional) try: from core.capture.screen_capturer import ScreenCapturer diff --git a/visual_workflow_builder/backend/db/models.py b/visual_workflow_builder/backend/db/models.py index 8100327da..0bb4caecc 100644 --- a/visual_workflow_builder/backend/db/models.py +++ b/visual_workflow_builder/backend/db/models.py @@ -183,6 +183,11 @@ class VisualAnchor(db.Model): # Description pour l'utilisateur description = db.Column(db.Text, nullable=True) + # Texte OCR extrait du crop de l'ancre (analyse à la capture) + target_text = db.Column(db.Text, nullable=True) + # Description VLM de l'ancre (si l'OCR ne trouve pas de texte) + ocr_description = db.Column(db.Text, nullable=True) + # Seuil de confiance pour la détection confidence_threshold = db.Column(db.Float, default=0.8) @@ -207,6 +212,8 @@ class VisualAnchor(db.Model): 'height': self.screen_height } if self.screen_width else None, 'description': self.description, + 'target_text': self.target_text, + 'ocr_description': self.ocr_description, 'confidence_threshold': self.confidence_threshold, 'created_at': self.created_at.isoformat() if self.created_at else None } diff --git a/visual_workflow_builder/frontend_v4/src/types.ts b/visual_workflow_builder/frontend_v4/src/types.ts index 75b442418..8e811a009 100644 --- a/visual_workflow_builder/frontend_v4/src/types.ts +++ b/visual_workflow_builder/frontend_v4/src/types.ts @@ -254,6 +254,8 @@ export interface VisualAnchor { bounding_box: { x: number; y: number; width: number; height: number }; thumbnail_url?: string; description?: string; + target_text?: string; + ocr_description?: string; } export interface Step {