fix(replay): guard single in-flight dispatch

Add a private in-flight helper for replay dispatch, block machine retargeting while an action is still pending on the previous session, and warn on duplicate in-flight entries for the same replay triplet.

Freeze the Notepad runtime dialog success path and add integration coverage for single in-flight dispatch, watchdog late-report documentation, and the known concurrent-poll race as an xfail.
This commit is contained in:
Dom
2026-05-25 11:00:59 +02:00
parent 7bb8d543ab
commit 4ba426c205
3 changed files with 805 additions and 10 deletions

View File

@@ -625,6 +625,40 @@ def _remove_queued_action_duplicates(session_id: str, action_id: str) -> int:
return removed
def _find_in_flight_action(
    session_id: str,
    machine_id: str,
    replay_id: str,
) -> Optional[str]:
    """Look up the action currently in flight for a replay triplet.

    An entry of `_retry_pending` counts as in flight when its `session_id`,
    `machine_id` and `replay_id` all equal the arguments and its
    `dispatched_at` is strictly positive. A missing, falsy or `0.0`
    `dispatched_at` is deliberately excluded: it marks a
    resume/retry/watchdog repush that still needs to be dispatched.

    The caller must hold `_replay_lock` for the duration of the call.

    Returns:
        The matching action id, or None when `replay_id` is empty or no
        entry matches. When several entries match, a warning is logged and
        the first one encountered during iteration is returned.
    """
    if not replay_id:
        return None

    # Scan a snapshot of the items rather than the live dict view.
    in_flight_ids: list[str] = [
        action_id
        for action_id, entry in list(_retry_pending.items())
        if entry.get("session_id") == session_id
        and entry.get("machine_id") == machine_id
        and entry.get("replay_id") == replay_id
        and float(entry.get("dispatched_at") or 0) > 0
    ]

    # More than one hit for the same triplet is unexpected: report it, but
    # still answer with the first hit so the caller can keep going.
    if len(in_flight_ids) > 1:
        logger.warning(
            "[REPLAY] _find_in_flight_action: %d in-flight actions for triplet "
            "session=%s machine=%s replay=%s — state may be corrupted, "
            "returning first (insertion order). action_ids=%s",
            len(in_flight_ids), session_id, machine_id, replay_id,
            in_flight_ids,
        )

    return in_flight_ids[0] if in_flight_ids else None
class StreamEvent(BaseModel):
session_id: str
timestamp: float
@@ -3071,21 +3105,61 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
f"actions_en_attente={len(queue)}"
)
if owning_replay is not None:
replay_id = owning_replay.get("replay_id", "")
in_flight_action_id = _find_in_flight_action(
session_id, machine_id, replay_id
)
if in_flight_action_id is not None:
logger.debug(
"[REPLAY] action déjà en vol replay=%s session=%s "
"machine=%s action_id=%s — pas de nouveau dispatch",
replay_id, session_id, machine_id, in_flight_action_id,
)
return {
"action": None,
"session_id": session_id,
"machine_id": machine_id,
"action_in_flight": True,
"in_flight_action_id": in_flight_action_id,
"replay_id": replay_id,
}
if not queue and machine_id != "default":
# Lookup 1 : machine_replay_target (mapping explicite POST /replay)
target_sid = _machine_replay_target.get(machine_id)
if target_sid and target_sid != session_id:
target_queue = _replay_queues.get(target_sid, [])
if target_queue:
# Vérifier que le replay_state ciblé concerne BIEN cette machine
target_state = None
for state in _replay_states.values():
if (state.get("session_id") == target_sid
and state.get("machine_id") == machine_id
and state["status"] == "running"):
target_state = state
break
if target_state:
# Vérifier que le replay_state ciblé concerne BIEN cette machine
target_state = None
for state in _replay_states.values():
if (state.get("session_id") == target_sid
and state.get("machine_id") == machine_id
and state["status"] == "running"):
target_state = state
break
if target_state:
replay_id = target_state.get("replay_id", "")
in_flight_action_id = _find_in_flight_action(
target_sid, machine_id, replay_id
)
if in_flight_action_id is not None:
logger.debug(
"[REPLAY] action déjà en vol replay=%s session=%s "
"ancienne_session=%s machine=%s action_id=%s"
"reciblage différé",
replay_id, session_id, target_sid, machine_id,
in_flight_action_id,
)
return {
"action": None,
"session_id": session_id,
"machine_id": machine_id,
"action_in_flight": True,
"in_flight_action_id": in_flight_action_id,
"replay_id": replay_id,
}
if target_queue:
queue = target_queue
owning_replay = target_state
_replay_queues[session_id] = target_queue
@@ -3101,6 +3175,26 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
and state["status"] == "running"
and state["session_id"] != session_id):
other_sid = state["session_id"]
replay_id = state.get("replay_id", "")
in_flight_action_id = _find_in_flight_action(
other_sid, machine_id, replay_id
)
if in_flight_action_id is not None:
logger.debug(
"[REPLAY] action déjà en vol replay=%s session=%s "
"ancienne_session=%s machine=%s action_id=%s"
"reciblage différé",
replay_id, session_id, other_sid, machine_id,
in_flight_action_id,
)
return {
"action": None,
"session_id": session_id,
"machine_id": machine_id,
"action_in_flight": True,
"in_flight_action_id": in_flight_action_id,
"replay_id": replay_id,
}
other_queue = _replay_queues.get(other_sid, [])
if other_queue:
queue = other_queue
@@ -3112,6 +3206,26 @@ async def get_next_action(session_id: str, machine_id: str = "default"):
logger.info(f"Replay machine-state: {machine_id} -> {other_sid} -> {session_id}")
break
if owning_replay is not None:
replay_id = owning_replay.get("replay_id", "")
in_flight_action_id = _find_in_flight_action(
session_id, machine_id, replay_id
)
if in_flight_action_id is not None:
logger.debug(
"[REPLAY] action déjà en vol replay=%s session=%s "
"machine=%s action_id=%s — pas de nouveau dispatch",
replay_id, session_id, machine_id, in_flight_action_id,
)
return {
"action": None,
"session_id": session_id,
"machine_id": machine_id,
"action_in_flight": True,
"in_flight_action_id": in_flight_action_id,
"replay_id": replay_id,
}
if not queue:
return {"action": None, "session_id": session_id, "machine_id": machine_id}