feat(p11): learn from offline cross-session matches
This commit is contained in:
@@ -2461,6 +2461,10 @@ class StreamProcessor:
|
||||
# Workflows construits (pour le matching)
|
||||
self._workflows: Dict[str, Any] = {}
|
||||
|
||||
# P1.1 : learner continu branché uniquement sur de vraies observations
|
||||
# cross-session, après matching offline contre un workflow existant.
|
||||
self._continuous_learner = None
|
||||
|
||||
# Shadow learning : dernier pattern UI détecté par session
|
||||
# Stocke {session_id: {"pattern": str, "ocr_text": str, "screen_state": obj, "shot_id": str}}
|
||||
self._pending_ui_patterns: Dict[str, Dict[str, Any]] = {}
|
||||
@@ -3005,6 +3009,14 @@ class StreamProcessor:
|
||||
except Exception as e2:
|
||||
return {"error": f"Erreur RawSession: {e2}"}
|
||||
|
||||
session_machine_id = getattr(session, "machine_id", None)
|
||||
cross_learning = self._run_cross_session_learning(
|
||||
session_id=session_id,
|
||||
states=states,
|
||||
embeddings=embeddings,
|
||||
machine_id=session_machine_id,
|
||||
)
|
||||
|
||||
# Construire le workflow via GraphBuilder
|
||||
try:
|
||||
from core.graph.graph_builder import GraphBuilder
|
||||
@@ -3086,6 +3098,7 @@ class StreamProcessor:
|
||||
"embeddings_indexed": len(embeddings),
|
||||
"saved_path": str(saved_path) if saved_path else None,
|
||||
"app_context": app_context,
|
||||
"cross_session_learning": cross_learning,
|
||||
}
|
||||
|
||||
logger.info(
|
||||
@@ -3130,6 +3143,257 @@ class StreamProcessor:
|
||||
|
||||
return None
|
||||
|
||||
def _get_continuous_learner(self):
|
||||
"""Lazy init du ContinuousLearner existant."""
|
||||
if self._continuous_learner is not None:
|
||||
return self._continuous_learner
|
||||
from core.learning.continuous_learner import ContinuousLearner
|
||||
self._continuous_learner = ContinuousLearner()
|
||||
return self._continuous_learner
|
||||
|
||||
def _run_cross_session_learning(
|
||||
self,
|
||||
session_id: str,
|
||||
states: List[Any],
|
||||
embeddings: List[np.ndarray],
|
||||
machine_id: Optional[str] = None,
|
||||
*,
|
||||
update_threshold: float = 0.85,
|
||||
drift_min_confidence: float = 0.50,
|
||||
) -> Dict[str, Any]:
|
||||
"""Matcher une session observée contre les workflows existants.
|
||||
|
||||
P1.1 Option A : le learner ne reçoit jamais le prototype du node
|
||||
existant. Il reçoit uniquement un embedding observé de la session
|
||||
courante, si cet embedding matche fortement un node d'un workflow déjà
|
||||
connu. Les identifiants écrits par le learner sont hashés pour éviter
|
||||
de propager un nom de workflow potentiellement métier dans les chemins.
|
||||
"""
|
||||
stats: Dict[str, Any] = {
|
||||
"status": "skipped",
|
||||
"states_seen": len(states),
|
||||
"embeddings_seen": len(embeddings),
|
||||
"candidate_workflows": 0,
|
||||
"matches": 0,
|
||||
"updates": 0,
|
||||
"drift_checks": 0,
|
||||
"skips": {},
|
||||
}
|
||||
|
||||
def _skip(reason: str) -> Dict[str, Any]:
|
||||
stats["skips"][reason] = stats["skips"].get(reason, 0) + 1
|
||||
return stats
|
||||
|
||||
if not states:
|
||||
return _skip("no_states")
|
||||
if len(states) != len(embeddings):
|
||||
return _skip("embedding_count_mismatch")
|
||||
|
||||
with self._data_lock:
|
||||
workflows = list(self._workflows.values())
|
||||
|
||||
if not workflows:
|
||||
return _skip("no_existing_workflow")
|
||||
|
||||
stats["candidate_workflows"] = sum(
|
||||
1
|
||||
for workflow in workflows
|
||||
if not (
|
||||
machine_id
|
||||
and getattr(workflow, "_machine_id", None)
|
||||
and getattr(workflow, "_machine_id", None) != machine_id
|
||||
)
|
||||
)
|
||||
if stats["candidate_workflows"] == 0:
|
||||
return _skip("no_candidate_workflow_for_machine")
|
||||
|
||||
learner = self._get_continuous_learner()
|
||||
stats["status"] = "processed"
|
||||
|
||||
for state, observed_embedding in zip(states, embeddings):
|
||||
observed_vector = self._normalise_vector(observed_embedding)
|
||||
if observed_vector is None:
|
||||
_skip("invalid_observed_embedding")
|
||||
continue
|
||||
|
||||
match = self._find_best_cross_session_match(
|
||||
state=state,
|
||||
observed_vector=observed_vector,
|
||||
workflows=workflows,
|
||||
machine_id=machine_id,
|
||||
min_confidence=drift_min_confidence,
|
||||
)
|
||||
if match is None:
|
||||
_skip("no_match")
|
||||
continue
|
||||
|
||||
stats["matches"] += 1
|
||||
node_key = self._learning_node_key(
|
||||
workflow_id=match["workflow_id"],
|
||||
node_id=match["node_id"],
|
||||
)
|
||||
|
||||
confidence = float(match["confidence"])
|
||||
learner.detect_drift(node_key, [confidence])
|
||||
stats["drift_checks"] += 1
|
||||
|
||||
if confidence < update_threshold:
|
||||
_skip("below_update_threshold")
|
||||
continue
|
||||
|
||||
prototype = match.get("prototype")
|
||||
if prototype is not None and np.allclose(
|
||||
observed_vector, prototype, atol=1e-6
|
||||
):
|
||||
_skip("same_as_existing_prototype")
|
||||
continue
|
||||
|
||||
# Signal réel : observation acceptée uniquement parce qu'un match
|
||||
# cross-session dépasse le seuil d'update. Les confidences faibles
|
||||
# alimentent seulement la détection de drift ci-dessus.
|
||||
execution_success = confidence >= update_threshold
|
||||
if not execution_success:
|
||||
_skip("no_success_signal")
|
||||
continue
|
||||
|
||||
learner.update_prototype(
|
||||
node_key,
|
||||
observed_vector.copy(),
|
||||
execution_success=execution_success,
|
||||
)
|
||||
stats["updates"] += 1
|
||||
|
||||
logger.info(
|
||||
"P1.1 cross-session learning: states=%d workflows=%d matches=%d "
|
||||
"updates=%d drift_checks=%d skips=%s",
|
||||
stats["states_seen"],
|
||||
stats["candidate_workflows"],
|
||||
stats["matches"],
|
||||
stats["updates"],
|
||||
stats["drift_checks"],
|
||||
stats["skips"],
|
||||
)
|
||||
return stats
|
||||
|
||||
def _find_best_cross_session_match(
|
||||
self,
|
||||
state: Any,
|
||||
observed_vector: np.ndarray,
|
||||
workflows: List[Any],
|
||||
machine_id: Optional[str],
|
||||
min_confidence: float,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Retour le meilleur node existant pour un embedding observé."""
|
||||
best: Optional[Dict[str, Any]] = None
|
||||
for workflow in workflows:
|
||||
workflow_machine = getattr(workflow, "_machine_id", None)
|
||||
if machine_id and workflow_machine and workflow_machine != machine_id:
|
||||
continue
|
||||
|
||||
workflow_id = getattr(workflow, "workflow_id", "")
|
||||
for node in getattr(workflow, "nodes", []) or []:
|
||||
prototype = self._extract_node_prototype(node)
|
||||
if prototype is None or prototype.shape != observed_vector.shape:
|
||||
continue
|
||||
|
||||
confidence = self._cosine_similarity(observed_vector, prototype)
|
||||
if confidence < min_confidence:
|
||||
continue
|
||||
if not self._template_accepts_observation(node, state, confidence):
|
||||
continue
|
||||
|
||||
if best is None or confidence > best["confidence"]:
|
||||
best = {
|
||||
"workflow_id": workflow_id,
|
||||
"node_id": getattr(node, "node_id", ""),
|
||||
"confidence": confidence,
|
||||
"prototype": prototype,
|
||||
}
|
||||
|
||||
return best
|
||||
|
||||
def _extract_node_prototype(self, node: Any) -> Optional[np.ndarray]:
|
||||
"""Extraire le prototype d'un node sans dépendre de FAISS."""
|
||||
meta = getattr(node, "metadata", {}) or {}
|
||||
proto_list = meta.get("_prototype_vector")
|
||||
if isinstance(proto_list, list):
|
||||
return self._normalise_vector(proto_list)
|
||||
|
||||
template = getattr(node, "template", None)
|
||||
embedding = getattr(template, "embedding", None) if template else None
|
||||
vector_id = getattr(embedding, "vector_id", None) if embedding else None
|
||||
if vector_id:
|
||||
try:
|
||||
path = Path(vector_id)
|
||||
if path.exists():
|
||||
return self._normalise_vector(np.load(path))
|
||||
except Exception as exc:
|
||||
logger.debug("Prototype node illisible, skip: %s", exc)
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _normalise_vector(vector: Any) -> Optional[np.ndarray]:
|
||||
try:
|
||||
arr = np.asarray(vector, dtype=np.float32)
|
||||
except Exception:
|
||||
return None
|
||||
if arr.ndim != 1 or arr.size == 0:
|
||||
return None
|
||||
norm = float(np.linalg.norm(arr))
|
||||
if norm <= 0:
|
||||
return None
|
||||
return arr / norm
|
||||
|
||||
@staticmethod
|
||||
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
||||
return float(np.clip(np.dot(a, b), -1.0, 1.0))
|
||||
|
||||
@staticmethod
|
||||
def _template_accepts_observation(
|
||||
node: Any,
|
||||
state: Any,
|
||||
confidence: float,
|
||||
) -> bool:
|
||||
template = getattr(node, "template", None)
|
||||
if template is None:
|
||||
return True
|
||||
try:
|
||||
window = getattr(template, "window", None)
|
||||
if window and hasattr(state, "window"):
|
||||
window_title = getattr(state.window, "window_title", "")
|
||||
process = getattr(state.window, "process", "")
|
||||
app_name = getattr(state.window, "app_name", "")
|
||||
if not window.matches(window_title, process or app_name):
|
||||
return False
|
||||
|
||||
text = getattr(template, "text", None)
|
||||
if text and hasattr(state, "perception"):
|
||||
detected = getattr(state.perception, "detected_text", [])
|
||||
if not text.matches(detected):
|
||||
return False
|
||||
|
||||
ui = getattr(template, "ui", None)
|
||||
if ui and hasattr(state, "ui_elements"):
|
||||
if not ui.matches(getattr(state, "ui_elements", [])):
|
||||
return False
|
||||
|
||||
return True
|
||||
except Exception as exc:
|
||||
logger.debug("Template match offline impossible, skip node: %s", exc)
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _learning_node_key(workflow_id: str, node_id: str) -> str:
|
||||
"""Clé learner stable sans fuite de nom workflow potentiellement métier."""
|
||||
digest = hashlib.sha256(f"{workflow_id}:{node_id}".encode("utf-8")).hexdigest()
|
||||
safe_node = "".join(
|
||||
ch if ch.isascii() and (ch.isalnum() or ch in "._-") else "_"
|
||||
for ch in str(node_id)
|
||||
).strip("._-")
|
||||
if not safe_node:
|
||||
safe_node = "node"
|
||||
return f"wf_{digest[:16]}__{safe_node[:64]}"
|
||||
|
||||
# =========================================================================
|
||||
# Enrichissement VLM des workflows (target_spec sur chaque edge)
|
||||
# =========================================================================
|
||||
|
||||
Reference in New Issue
Block a user