rpa_vision_v3/core/grounding/server.py
Dom 3d6868f029 docs: complete execution map + ORA target_text fix + file-based InfiGUI worker
docs/CARTOGRAPHY.md:
- Complete map of the 2 execution paths (Legacy vs ORA)
- 12 grounding systems identified, 3 of them dead
- Trace of the target_text field from capture to click
- Existing functions left unwired (verify, recovery, ShadowLearningHook)
- VRAM budget, critical files, modification rules

ORA target_text fix (observe_reason_act.py:217), sketched below:
- Detects nonsensical target_text values ("click_anchor")
- Calls _describe_anchor_image() (VLM) to describe the crop
- Same logic as legacy execute.py:893
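
A minimal sketch of that guard, under stated assumptions: only _describe_anchor_image() is named in the commit; GENERIC_LABELS, Step, resolve_target_text and the stub return value are hypothetical names for illustration, not the actual code at observe_reason_act.py:217.

# Hypothetical sketch; everything except _describe_anchor_image() is illustrative.
from dataclasses import dataclass

GENERIC_LABELS = {"click_anchor", "anchor", "element"}  # assumed deny-list

@dataclass
class Step:
    target_text: str
    anchor_crop: object  # a PIL.Image crop in the real pipeline

def _describe_anchor_image(crop) -> str:
    """Stub for the real VLM call that describes the cropped anchor."""
    return "blue 'Save' button"  # the VLM returns a short phrase like this

def resolve_target_text(step: Step) -> str:
    """Swap a meaningless label for a VLM description of the anchor crop."""
    label = (step.target_text or "").strip()
    if label.lower() in GENERIC_LABELS:
        label = _describe_anchor_image(step.anchor_crop)  # same recovery as legacy execute.py:893
    return label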

InfiGUI worker via /tmp files (see the sketch after this list):
- Communication through files (no subprocess pipes, no HTTP)
- Independent process started before the backend
- Fixes the CUDA crash inside Flask/FastAPI/uvicorn
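
A minimal sketch of one possible file handshake. The commit only states that requests and responses travel through files in /tmp; the /tmp/infigui directory, the .req/.resp naming, and the polling interval here are all assumptions.

import json, os, time, uuid

REQ_DIR = "/tmp/infigui"  # assumed directory; the real path is not in the commit

def ground_via_files(payload: dict, timeout: float = 30.0) -> dict:
    """Write a request file, then poll for the worker's matching response file."""
    os.makedirs(REQ_DIR, exist_ok=True)
    rid = uuid.uuid4().hex
    req = f"{REQ_DIR}/{rid}.req.json"
    resp = f"{REQ_DIR}/{rid}.resp.json"
    tmp = req + ".tmp"
    with open(tmp, "w") as f:
        json.dump(payload, f)  # write-then-rename so the worker never sees a half-written file
    os.rename(tmp, req)
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(resp):
            with open(resp) as f:
                out = json.load(f)
            os.remove(resp)
            return out
        time.sleep(0.05)
    raise TimeoutError("InfiGUI worker did not answer in time")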

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-26 12:37:43 +02:00

114 lines
4.9 KiB
Python

"""Serveur grounding minimaliste — Flask single-thread, même contexte CUDA."""
import base64, io, json, math, os, re, time, gc
import torch
from flask import Flask, request, jsonify
from PIL import Image
app = Flask(__name__)
MODEL_ID = os.environ.get("GROUNDING_MODEL", "InfiX-ai/InfiGUI-G1-3B")
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 5600 * 28 * 28
_model = None
_processor = None
def _smart_resize(h, w, factor=28):
    """Round (h, w) to multiples of `factor` while keeping the area in [MIN_PIXELS, MAX_PIXELS]."""
    h_bar = max(factor, round(h/factor)*factor)
    w_bar = max(factor, round(w/factor)*factor)
    if h_bar*w_bar > MAX_PIXELS:
        beta = math.sqrt((h*w)/MAX_PIXELS)
        h_bar = math.floor(h/beta/factor)*factor
        w_bar = math.floor(w/beta/factor)*factor
    elif h_bar*w_bar < MIN_PIXELS:
        beta = math.sqrt(MIN_PIXELS/(h*w))
        h_bar = math.ceil(h*beta/factor)*factor
        w_bar = math.ceil(w*beta/factor)*factor
    return h_bar, w_bar
def load_model():
    """Load the model once, 4-bit NF4 quantized, and keep it resident."""
    global _model, _processor
    if _model is not None:
        return
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
    torch.cuda.empty_cache(); gc.collect()
    print(f"[grounding] Loading {MODEL_ID}...")
    bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                             bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)
    _model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, quantization_config=bnb, device_map="auto")
    _model.eval()
    _processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS, padding_side="left")
    print(f"[grounding] Ready - VRAM: {torch.cuda.memory_allocated()/1e9:.2f}GB")
@app.route('/health')
def health():
    return jsonify({"status": "ok", "model": MODEL_ID, "model_loaded": _model is not None,
                    "cuda_available": torch.cuda.is_available(),
                    "vram_allocated_gb": round(torch.cuda.memory_allocated()/1e9, 2)})
@app.route('/ground', methods=['POST'])
def ground():
    if _model is None:
        return jsonify({"error": "Model not loaded"}), 503
    from qwen_vl_utils import process_vision_info
    data = request.json
    target = data.get('target_text', '')
    desc = data.get('target_description', '')
    label = f"{target} {desc}" if desc else target
    if not label.strip():
        return jsonify({"error": "target_text required"}), 400
    # Image: base64 payload from the client, else a fresh screen grab
    if data.get('image_b64'):
        raw = data['image_b64'].split(',')[1] if ',' in data['image_b64'] else data['image_b64']
        img = Image.open(io.BytesIO(base64.b64decode(raw))).convert('RGB')
    else:
        import mss
        with mss.mss() as sct:
            grab = sct.grab(sct.monitors[0])  # monitor 0 = full virtual screen
            img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
    W, H = img.size
    rH, rW = _smart_resize(H, W)
    user_text = f'The screen\'s resolution is {rW}x{rH}.\nLocate the UI element(s) for "{label}", output the coordinates using JSON format: [{{"point_2d": [x, y]}}, ...]'
    system = "You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within <think> </think> tags."
    messages = [{"role": "system", "content": system},
                {"role": "user", "content": [{"type": "image", "image": img}, {"type": "text", "text": user_text}]}]
    text = _processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = _processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(_model.device)
    t0 = time.time()
    with torch.no_grad():
        gen = _model.generate(**inputs, max_new_tokens=512)
    infer_ms = (time.time()-t0)*1000
    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen)]  # strip the prompt tokens
    raw = _processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0].strip()
    print(f"[grounding] '{label[:40]}' -> {raw[:100]} ({infer_ms:.0f}ms)")
    # Parse the JSON point_2d answer emitted after the </think> block
    json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
    json_part = json_part.replace("```json", "").replace("```", "").strip()
    px, py = None, None
    try:
        parsed = json.loads(json_part)
        if isinstance(parsed, list) and len(parsed) > 0:
            pt = parsed[0].get("point_2d", [])
            if len(pt) >= 2:
                # Model coordinates are in resized space; scale back to screen pixels
                px, py = int(pt[0]*W/rW), int(pt[1]*H/rH)
    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        # Malformed or oddly shaped JSON: fall back to a regex over the raw output
        m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
        if m:
            px, py = int(int(m.group(1))*W/rW), int(int(m.group(2))*H/rH)
    return jsonify({"x": px, "y": py, "method": "infigui", "confidence": 0.90 if px is not None else 0.0,
                    "time_ms": round(infer_ms, 1), "raw_output": raw[:300]})
if __name__ == '__main__':
    load_model()
    # threaded=False serves one request at a time in a single CUDA context
    app.run(host='0.0.0.0', port=8200, threaded=False)
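
For reference, a hedged client-side sketch against this server. The port (8200) and the /ground JSON shape come from the file above; the helper name ground_point and the PNG encoding are illustrative choices, not part of the codebase.

import base64, io
import requests
from PIL import Image

def ground_point(label: str, img: Image.Image, host: str = "http://127.0.0.1:8200") -> dict:
    """POST a labelled screenshot to /ground and return the predicted click point."""
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    payload = {"target_text": label,
               "image_b64": base64.b64encode(buf.getvalue()).decode()}
    r = requests.post(f"{host}/ground", json=payload, timeout=60)
    r.raise_for_status()
    return r.json()  # {"x": ..., "y": ..., "confidence": ..., "time_ms": ..., ...}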