fix(replay): guard single in-flight dispatch
Add a private in-flight helper for replay dispatch, block machine retargeting while an action is still pending on the previous session, and warn on duplicate in-flight entries for the same replay triplet. Freeze the Notepad runtime dialog success path and add integration coverage for single in-flight dispatch, watchdog late-report documentation, and the known concurrent-poll race as an xfail.
This commit is contained in:
@@ -625,6 +625,40 @@ def _remove_queued_action_duplicates(session_id: str, action_id: str) -> int:
|
||||
return removed
|
||||
|
||||
|
||||
def _find_in_flight_action(
    session_id: str,
    machine_id: str,
    replay_id: str,
) -> Optional[str]:
    """Return the id of the in-flight action for this replay triplet, if any.

    An entry of `_retry_pending` is considered in-flight when its
    `session_id`, `machine_id` and `replay_id` all equal the arguments and
    its `dispatched_at` value is strictly positive.

    Must be called while `_replay_lock` is held. `dispatched_at == 0.0` is
    intentionally not in-flight: it marks a resume/retry/watchdog repush that
    still needs to be dispatched. A missing or `None` `dispatched_at` is
    treated the same way.

    Args:
        session_id: Session the pending action must belong to.
        machine_id: Machine the pending action must belong to.
        replay_id: Replay the pending action must belong to. An empty value
            short-circuits to `None` without scanning `_retry_pending`.

    Returns:
        The key in `_retry_pending` of the first matching entry (dict
        iteration order), or `None` when nothing matches.
    """
    if not replay_id:
        return None

    # Iterate over a snapshot of the items so the scan itself is unaffected
    # if the mapping is mutated while we walk it.
    matches: list[str] = [
        pending_action_id
        for pending_action_id, pending in list(_retry_pending.items())
        if pending.get("session_id") == session_id
        and pending.get("machine_id") == machine_id
        and pending.get("replay_id") == replay_id
        # `or 0` maps a missing/None timestamp onto the "not dispatched" case.
        and float(pending.get("dispatched_at") or 0) > 0
    ]

    if not matches:
        return None

    if len(matches) > 1:
        # More than one in-flight action for the same triplet is unexpected;
        # report it but still answer with the first match.
        logger.warning(
            "[REPLAY] _find_in_flight_action: %d in-flight actions for triplet "
            "session=%s machine=%s replay=%s — state may be corrupted, "
            "returning first (insertion order). action_ids=%s",
            len(matches), session_id, machine_id, replay_id, matches,
        )

    return matches[0]
|
||||
|
||||
|
||||
class StreamEvent(BaseModel):
|
||||
session_id: str
|
||||
timestamp: float
|
||||
@@ -3071,21 +3105,61 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
|
||||
f"actions_en_attente={len(queue)}"
|
||||
)
|
||||
|
||||
if owning_replay is not None:
|
||||
replay_id = owning_replay.get("replay_id", "")
|
||||
in_flight_action_id = _find_in_flight_action(
|
||||
session_id, machine_id, replay_id
|
||||
)
|
||||
if in_flight_action_id is not None:
|
||||
logger.debug(
|
||||
"[REPLAY] action déjà en vol replay=%s session=%s "
|
||||
"machine=%s action_id=%s — pas de nouveau dispatch",
|
||||
replay_id, session_id, machine_id, in_flight_action_id,
|
||||
)
|
||||
return {
|
||||
"action": None,
|
||||
"session_id": session_id,
|
||||
"machine_id": machine_id,
|
||||
"action_in_flight": True,
|
||||
"in_flight_action_id": in_flight_action_id,
|
||||
"replay_id": replay_id,
|
||||
}
|
||||
|
||||
if not queue and machine_id != "default":
|
||||
# Lookup 1 : machine_replay_target (mapping explicite POST /replay)
|
||||
target_sid = _machine_replay_target.get(machine_id)
|
||||
if target_sid and target_sid != session_id:
|
||||
target_queue = _replay_queues.get(target_sid, [])
|
||||
if target_queue:
|
||||
# Vérifier que le replay_state ciblé concerne BIEN cette machine
|
||||
target_state = None
|
||||
for state in _replay_states.values():
|
||||
if (state.get("session_id") == target_sid
|
||||
and state.get("machine_id") == machine_id
|
||||
and state["status"] == "running"):
|
||||
target_state = state
|
||||
break
|
||||
if target_state:
|
||||
# Vérifier que le replay_state ciblé concerne BIEN cette machine
|
||||
target_state = None
|
||||
for state in _replay_states.values():
|
||||
if (state.get("session_id") == target_sid
|
||||
and state.get("machine_id") == machine_id
|
||||
and state["status"] == "running"):
|
||||
target_state = state
|
||||
break
|
||||
if target_state:
|
||||
replay_id = target_state.get("replay_id", "")
|
||||
in_flight_action_id = _find_in_flight_action(
|
||||
target_sid, machine_id, replay_id
|
||||
)
|
||||
if in_flight_action_id is not None:
|
||||
logger.debug(
|
||||
"[REPLAY] action déjà en vol replay=%s session=%s "
|
||||
"ancienne_session=%s machine=%s action_id=%s — "
|
||||
"reciblage différé",
|
||||
replay_id, session_id, target_sid, machine_id,
|
||||
in_flight_action_id,
|
||||
)
|
||||
return {
|
||||
"action": None,
|
||||
"session_id": session_id,
|
||||
"machine_id": machine_id,
|
||||
"action_in_flight": True,
|
||||
"in_flight_action_id": in_flight_action_id,
|
||||
"replay_id": replay_id,
|
||||
}
|
||||
if target_queue:
|
||||
queue = target_queue
|
||||
owning_replay = target_state
|
||||
_replay_queues[session_id] = target_queue
|
||||
@@ -3101,6 +3175,26 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
|
||||
and state["status"] == "running"
|
||||
and state["session_id"] != session_id):
|
||||
other_sid = state["session_id"]
|
||||
replay_id = state.get("replay_id", "")
|
||||
in_flight_action_id = _find_in_flight_action(
|
||||
other_sid, machine_id, replay_id
|
||||
)
|
||||
if in_flight_action_id is not None:
|
||||
logger.debug(
|
||||
"[REPLAY] action déjà en vol replay=%s session=%s "
|
||||
"ancienne_session=%s machine=%s action_id=%s — "
|
||||
"reciblage différé",
|
||||
replay_id, session_id, other_sid, machine_id,
|
||||
in_flight_action_id,
|
||||
)
|
||||
return {
|
||||
"action": None,
|
||||
"session_id": session_id,
|
||||
"machine_id": machine_id,
|
||||
"action_in_flight": True,
|
||||
"in_flight_action_id": in_flight_action_id,
|
||||
"replay_id": replay_id,
|
||||
}
|
||||
other_queue = _replay_queues.get(other_sid, [])
|
||||
if other_queue:
|
||||
queue = other_queue
|
||||
@@ -3112,6 +3206,26 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
|
||||
logger.info(f"Replay machine-state: {machine_id} -> {other_sid} -> {session_id}")
|
||||
break
|
||||
|
||||
if owning_replay is not None:
|
||||
replay_id = owning_replay.get("replay_id", "")
|
||||
in_flight_action_id = _find_in_flight_action(
|
||||
session_id, machine_id, replay_id
|
||||
)
|
||||
if in_flight_action_id is not None:
|
||||
logger.debug(
|
||||
"[REPLAY] action déjà en vol replay=%s session=%s "
|
||||
"machine=%s action_id=%s — pas de nouveau dispatch",
|
||||
replay_id, session_id, machine_id, in_flight_action_id,
|
||||
)
|
||||
return {
|
||||
"action": None,
|
||||
"session_id": session_id,
|
||||
"machine_id": machine_id,
|
||||
"action_in_flight": True,
|
||||
"in_flight_action_id": in_flight_action_id,
|
||||
"replay_id": replay_id,
|
||||
}
|
||||
|
||||
if not queue:
|
||||
return {"action": None, "session_id": session_id, "machine_id": machine_id}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user