feat(p1): persist workflows and semantic learning artifacts

This commit is contained in:
Dom
2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions

View File

@@ -687,6 +687,7 @@ def _extract_required_apps_from_events(
- launch_result_target: dict optionnel (vrai clic SearchHost -> app)
"""
app_counts: Dict[str, int] = defaultdict(int)
app_titles: Dict[str, List[str]] = defaultdict(list)
first_app = None
first_window_title = None
@@ -702,6 +703,8 @@ def _extract_required_apps_from_events(
title = to_info.get("title", "")
if app_name:
app_counts[app_name] += 1
if title:
app_titles[app_name].append(title)
if first_app is None and app_name.lower() not in _SETUP_IGNORE_APPS:
first_app = app_name
first_window_title = title
@@ -741,6 +744,10 @@ def _extract_required_apps_from_events(
"primary_launch_cmd": primary_launch_cmd,
"first_window_title": first_window_title or "",
"apps": dict(app_counts),
"has_neutral_window_title": any(
_is_neutral_window_title(title)
for title in app_titles.get(primary_app, [])
),
}
if start_menu_target:
result["start_menu_target"] = start_menu_target
@@ -927,6 +934,9 @@ def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
"primary_launch_cmd": primary_launch_cmd,
"first_window_title": first_title,
"apps": {},
"has_neutral_window_title": any(
_is_neutral_window_title(title) for title in window_titles
),
"source_session_id": source_session_id,
"machine_id": machine_id,
}
@@ -1113,6 +1123,50 @@ def _generate_run_dialog_setup_actions(
},
]
needs_fresh_notepad_document = (
primary_app.lower() == "notepad.exe"
and (
bool(app_info.get("has_neutral_window_title"))
or _is_neutral_window_title(first_title)
)
)
if needs_fresh_notepad_document:
if title_patterns or first_title:
actions.append({
"action_id": f"act_{setup_id_prefix}_verify_before_fresh_document",
"type": "verify_screen",
"expected_node": "setup_initial_before_fresh_document",
"timeout_ms": 5000,
"_setup_phase": True,
"_setup_step": "verify_app_ready_before_fresh_document",
"_setup_strategy": "run_dialog",
"expected_window_title_contains": title_patterns or [first_title],
"intention": (
"vérifier que Bloc-notes est la scène active avant "
"d'ouvrir un document vierge"
),
})
actions.extend([
{
"action_id": f"act_{setup_id_prefix}_ensure_fresh_document",
"type": "key_combo",
"keys": ["ctrl", "n"],
"_setup_phase": True,
"_setup_step": "ensure_fresh_document",
"_setup_strategy": "run_dialog",
"expected_window_before": first_title,
"intention": "ouvrir un document Bloc-notes vierge non nommé",
},
{
"action_id": f"act_{setup_id_prefix}_wait_fresh_document",
"type": "wait",
"duration_ms": 400,
"_setup_phase": True,
"_setup_step": "wait_fresh_document",
"_setup_strategy": "run_dialog",
},
])
if title_patterns or first_title:
actions.append({
"action_id": f"act_{setup_id_prefix}_verify",
@@ -1688,6 +1742,63 @@ def _is_learned_workflow(workflow) -> bool:
return has_prototype
_TARGET_SEMANTIC_KEYS = (
"by_text",
"by_role",
"anchor_id",
"target_text",
"ocr_description",
"description",
"vlm_description",
"anchor_image_base64",
"by_text_source",
"window_title",
"anchor_bbox",
"original_size",
)
def _first_non_empty_text(*values: Any) -> str:
for value in values:
text = str(value or "").strip()
if text and text.casefold() not in {"none", "null"}:
return text
return ""
def _target_attr(target: Any, key: str, default: Any = None) -> Any:
if isinstance(target, dict):
return target.get(key, default)
return getattr(target, key, default)
def _copy_semantic_target_fields(
    target_spec: Dict[str, Any],
    *sources: Optional[Dict[str, Any]],
) -> None:
    """Fill ``target_spec`` in place with semantic fields found in ``sources``.

    For every key in ``_TARGET_SEMANTIC_KEYS``, the first truthy value found
    across ``sources`` (in argument order) is copied, unless ``target_spec``
    already holds a truthy value for that key. Sources that are not dicts
    (for example ``None``) are ignored.

    Two fallbacks are then applied:

    * when ``by_text`` is still missing, it is derived from ``target_text``
      and ``by_text_source`` is set to ``"visual_anchor"`` if that key is
      not already present;
    * when ``vlm_description`` is still missing, it is derived from
      ``description``, then ``ocr_description``.

    Args:
        target_spec: Dict to complete; mutated in place.
        *sources: Candidate dicts, highest priority first.

    Returns:
        None. The result is the mutation of ``target_spec``.
    """
    for source in sources:
        if not isinstance(source, dict):
            continue
        for key in _TARGET_SEMANTIC_KEYS:
            value = source.get(key)
            # Never overwrite a value that is already set and truthy.
            if value and not target_spec.get(key):
                target_spec[key] = value

    if not target_spec.get("by_text"):
        target_text = _first_non_empty_text(target_spec.get("target_text"))
        if target_text:
            target_spec["by_text"] = target_text
            # Keep an existing by_text_source entry if one is present.
            target_spec.setdefault("by_text_source", "visual_anchor")

    if not target_spec.get("vlm_description"):
        description = _first_non_empty_text(
            target_spec.get("description"),
            target_spec.get("ocr_description"),
        )
        if description:
            target_spec["vlm_description"] = description
def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Convertir un WorkflowEdge en liste d'actions normalisées pour l'Agent V1.
@@ -1705,8 +1816,9 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,
# Extraire les coordonnées normalisées depuis TargetSpec.by_position
x_pct = 0.0
y_pct = 0.0
if target and target.by_position:
px, py = target.by_position
by_position = _target_attr(target, "by_position")
if target and by_position:
px, py = by_position
if px <= 1.0 and py <= 1.0:
x_pct = px
y_pct = py
@@ -1769,10 +1881,15 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,
elif action_type == "extract_table":
normalized["type"] = "extract_table"
normalized["parameters"] = {
"output_var": action_params.get("output_var", "table_rows"),
"output_var": (
action_params.get("variable_name")
or action_params.get("output_var")
or "table_rows"
),
"pattern": action_params.get("pattern"),
"limit": action_params.get("limit"),
"region": action_params.get("region"),
"engine": action_params.get("engine", "easyocr"),
}
return [normalized]
@@ -1833,14 +1950,33 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,
# Ajouter le target_spec complet pour la résolution visuelle
target_spec = {}
if target and target.by_role:
target_spec["by_role"] = target.by_role
normalized["target_role"] = target.by_role # Compat debug
if target and target.by_text:
target_spec["by_text"] = target.by_text
normalized["target_text"] = target.by_text # Compat debug
if target and hasattr(target, 'context_hints') and target.context_hints:
target_spec["context_hints"] = target.context_hints
by_role = _target_attr(target, "by_role", "")
by_text = _target_attr(target, "by_text", "")
context_hints = _target_attr(target, "context_hints", {}) or {}
if target and by_role:
target_spec["by_role"] = by_role
normalized["target_role"] = by_role # Compat debug
if target and by_text:
target_spec["by_text"] = by_text
normalized["target_text"] = by_text # Compat debug
if target and context_hints:
target_spec["context_hints"] = context_hints
_copy_semantic_target_fields(
target_spec,
action_params,
action_params.get("target_spec") if isinstance(action_params, dict) else None,
context_hints,
)
semantic_label = _first_non_empty_text(
target_spec.get("by_text"),
target_spec.get("target_text"),
target_spec.get("description"),
target_spec.get("ocr_description"),
target_spec.get("vlm_description"),
)
if semantic_label:
normalized.setdefault("target_text", target_spec.get("target_text") or semantic_label)
normalized.setdefault("target_description", semantic_label)
if target_spec:
normalized["target_spec"] = target_spec
normalized["visual_mode"] = True # Signal à l'agent d'utiliser la résolution visuelle
@@ -2004,6 +2140,7 @@ def _handle_extract_table_action(
output_var : nom de variable runtime (default "table_rows")
pattern : regex à matcher sur chaque token OCR (ex : r"^25\\d{6}$")
limit : nb max d'entrées à retourner
engine : easyocr (défaut) ou tesseract/digits/ipp pour chiffres
region : (x, y, w, h) en pixels pour cropper avant OCR
(None = image entière)
@@ -2014,6 +2151,7 @@ def _handle_extract_table_action(
output_var = (params.get("output_var") or params.get("variable_name") or "table_rows").strip()
pattern = params.get("pattern") or None
limit = params.get("limit")
engine = params.get("engine") or "easyocr"
region = params.get("region") or None
if isinstance(limit, str):
try:
@@ -2058,6 +2196,7 @@ def _handle_extract_table_action(
region=tuple(region) if region else None,
pattern=pattern,
limit=limit,
engine=engine,
)
except Exception as e:
logger.warning(
@@ -2071,8 +2210,8 @@ def _handle_extract_table_action(
replay_state.setdefault("variables", {})[output_var] = rows
logger.info(
"extract_table → variable '%s' (%d entrées, pattern=%r, limit=%s) replay %s",
output_var, len(rows), pattern, limit, replay_state.get("replay_id", "?"),
"extract_table → variable '%s' (%d entrées, pattern=%r, limit=%s, engine=%s) replay %s",
output_var, len(rows), pattern, limit, engine, replay_state.get("replay_id", "?"),
)
return bool(rows)
@@ -2410,6 +2549,29 @@ def _expand_compound_steps(
action["x_pct"] = step.get("x_pct", 0.0)
action["y_pct"] = step.get("y_pct", 0.0)
action["button"] = step.get("button", "left")
target_spec: Dict[str, Any] = {}
_copy_semantic_target_fields(
target_spec,
step,
step.get("target_spec") if isinstance(step, dict) else None,
step.get("visual_anchor") if isinstance(step, dict) else None,
)
semantic_label = _first_non_empty_text(
target_spec.get("by_text"),
target_spec.get("target_text"),
target_spec.get("description"),
target_spec.get("ocr_description"),
target_spec.get("vlm_description"),
)
if semantic_label:
action.setdefault(
"target_text",
target_spec.get("target_text") or semantic_label,
)
action.setdefault("target_description", semantic_label)
if target_spec:
action["target_spec"] = target_spec
action["visual_mode"] = True
else:
logger.debug(f"Step compound inconnu : {step_type}")
@@ -2659,6 +2821,8 @@ def _create_replay_state(
a_copy = {
"action_id": a.get("action_id"),
"type": a.get("type"),
"keys": a.get("keys"),
"button": a.get("button"),
"x_pct": a.get("x_pct"),
"y_pct": a.get("y_pct"),
# Contrôle strict des étapes (Dom, matin 10 avril 2026)
@@ -2667,6 +2831,9 @@ def _create_replay_state(
"expected_window_title": a.get("expected_window_title", ""),
# Contexte métier utile pour logs et apprentissage
"intention": a.get("intention", ""),
"target_text": a.get("target_text", ""),
"target_description": a.get("target_description", ""),
"description": a.get("description", ""),
}
ts = a.get("target_spec")
if isinstance(ts, dict):