fix(workflow): Restore run_counter during rehydration to ensure unique loop paths

DeanChensj · DeanChensj · commit 7127b618519f · 2026-04-09T08:54:18.000-07:00
When a workflow was rehydrated after an interruption, the `run_counter` in `NodeState` defaulted to 0 because it was not explicitly restored from events.
In loop scenarios (e.g., human-in-the-loop (HITL) loops), this caused the engine to generate duplicate `run_id` paths (like `@1`) when it looped back and triggered the node again, leading to path collisions and state-carryover bugs.

This fix explicitly restores `run_counter` from the event history during `_restore_static_nodes_from_events`, ensuring that subsequent runs of the same node in a loop generate unique sequential paths (like `@2`, `@3`).

This ensures correctness in human-in-the-loop samples like `request_input` where the execution loops back based on user feedback.

Change-Id: I2a8e9d0797b47a441261b4d1aa3ab932424a8b54
diff --git a/src/google/adk/workflow/_workflow_class.py b/src/google/adk/workflow/_workflow_class.py
@@ -536,6 +536,7 @@ def _restore_static_nodes_from_events(
       unresolved = child.interrupt_ids - child.resolved_ids
       existing_evt_run_id = child.run_id
 
+      run_counter = int(existing_evt_run_id) if existing_evt_run_id else 0
       if unresolved:
         node = self._get_static_node_by_name(child_name)
         if node.rerun_on_resume and child.resolved_ids:
@@ -547,6 +548,7 @@ def _restore_static_nodes_from_events(
               status=NodeStatus.PENDING,
               resume_inputs=child.resolved_responses,
               run_id=existing_evt_run_id,
+              run_counter=run_counter,
           )
         else:
           # Child can't handle partial resume, or nothing resolved
@@ -555,12 +557,14 @@ def _restore_static_nodes_from_events(
               status=NodeStatus.WAITING,
               interrupts=list(unresolved),
               run_id=existing_evt_run_id,
+              run_counter=run_counter,
           )
       elif child.output is not None:
         # Node's all interrupts are resolved and had output in previous run.
         nodes[child_name] = NodeState(
             status=NodeStatus.COMPLETED,
             run_id=existing_evt_run_id,
+            run_counter=run_counter,
         )
         node_outputs[child_name] = child.output
       elif child.interrupt_ids:
@@ -570,17 +574,35 @@ def _restore_static_nodes_from_events(
           nodes[child_name] = NodeState(
               status=NodeStatus.COMPLETED,
               run_id=existing_evt_run_id,
+              run_counter=run_counter,
           )
           node_outputs[child_name] = self._extract_resume_output(child, ctx)
-
           # Mark that we need to trigger downstream for this node
           nodes_to_trigger.append((child_name, node_outputs[child_name]))
         else:
           nodes[child_name] = NodeState(
               status=NodeStatus.PENDING,
               resume_inputs=child.resolved_responses,
               run_id=existing_evt_run_id,
+              run_counter=run_counter,
           )
+      if child_name not in nodes:
+        is_wait_for_output = False
+        try:
+          node = self._get_static_node_by_name(child_name)
+          is_wait_for_output = node.wait_for_output
+        except ValueError:
+          pass
+
+        # For nodes with events but no output:
+        # If wait_for_output is True, they are still WAITING for output.
+        # Otherwise, they are considered COMPLETED (e.g., side-effect nodes).
+        status = NodeStatus.WAITING if is_wait_for_output and child.output is None else NodeStatus.COMPLETED
+        nodes[child_name] = NodeState(
+            status=status,
+            run_id=existing_evt_run_id,
+            run_counter=run_counter,
+        )
 
     # wait_for_output nodes that were triggered but produced no output
     self._add_wait_for_output_nodes(nodes, children)
@@ -599,13 +621,6 @@ def _restore_static_nodes_from_events(
         for interrupt_id in state.interrupts
     }
 
-    # Restore run_counter from run_id so resumed nodes continue
-    # sequential ids. When NodeState is persisted (resumability=on),
-    # run_counter will already be correct from deserialization.
-    for state in nodes.values():
-      if state.run_id and state.run_id.isdigit():
-        state.run_counter = int(state.run_id)
-
     logger.info('node %s rehydrate end.', ctx.node_path)
 
   def _extract_resume_output(self, child: _ChildScanState, ctx: Context) -> Any:
@@ -686,7 +701,7 @@ def _scan_child_events(self, ctx: Context) -> dict[str, _ChildScanState]:
 
       # New run_id → reset child state (previous run stale).
       # ONLY update run_id from direct child events, not descendants!
-      evt_run_id = event.node_info.run_id
+      evt_run_id = event.node_info.path.rsplit('@', 1)[-1] if '@' in event.node_info.path else ''
       if (
           is_direct_child(event.node_info.path, workflow_path)
           and evt_run_id
@@ -751,6 +766,8 @@ def _process_resume(self, loop_state: _LoopState, ctx: Context) -> None:
     """Seed triggers for PENDING nodes and collect interrupt IDs."""
     for node_name, node_state in loop_state.nodes.items():
       if node_state.status == NodeStatus.PENDING:
+        if node_name in loop_state.trigger_buffer:
+          continue
         loop_state.trigger_buffer.setdefault(node_name, []).append(
             Trigger(
                 input=node_state.input,
diff --git a/tests/unittests/workflow/test_workflow_hitl.py b/tests/unittests/workflow/test_workflow_hitl.py
@@ -36,9 +36,9 @@
 from google.adk.workflow import START
 from google.adk.workflow._node_status import NodeStatus
 from google.adk.workflow._workflow_class import Workflow
-from google.adk.workflow.utils._workflow_hitl_utils import REQUEST_CREDENTIAL_FUNCTION_CALL_NAME
 from google.adk.workflow.utils._workflow_hitl_utils import create_request_input_response
 from google.adk.workflow.utils._workflow_hitl_utils import get_request_input_interrupt_ids
+from google.adk.workflow.utils._workflow_hitl_utils import REQUEST_CREDENTIAL_FUNCTION_CALL_NAME
 from google.adk.workflow.utils._workflow_hitl_utils import REQUEST_INPUT_FUNCTION_CALL_NAME
 from google.adk.workflow.utils._workflow_hitl_utils import wrap_response
 from google.genai import types
@@ -1378,6 +1378,7 @@ def process():
   events1 = await runner.run_async(testing_utils.get_user_content('go'))
   req1 = workflow_testing_utils.get_request_input_events(events1)
   assert len(req1) == 1
+  assert 'review@1' in req1[0].node_info.path
   inv_id = events1[0].invocation_id
 
   # Turn 2: revise → process reruns → review reruns → interrupt again
@@ -1389,6 +1390,7 @@ def process():
   )
   req2 = workflow_testing_utils.get_request_input_events(events2)
   assert len(req2) == 1, 'Expected second interrupt after revise'
+  assert 'review@2' in req2[0].node_info.path
   inv_id = events2[0].invocation_id
 
   # Turn 3: approve → should complete, not loop
@@ -1580,3 +1582,107 @@ def second_task():
   # Both nodes ran — node_b did NOT pause for a second auth request.
   assert call_log == ['first', 'second']
   assert sink.received_inputs == [{'status': 'done'}]
+
+
+@pytest.mark.asyncio
+async def test_workflow_loop_generates_unique_paths_across_resume(
+    request: pytest.FixtureRequest
+):
+  """Workflow loop generates unique sequential paths across resumes.
+
+  Setup: workflow simulating request_input sample with a loop and a RequestInput node.
+  Act:
+    - Turn 1: trigger RequestInput and interrupt.
+    - Turn 2: provide response triggering a loop back, and trigger RequestInput again.
+  Assert:
+    - Turn 1: node path has @1.
+    - Turn 2: node path has @2.
+  """
+  from google.adk.workflow import node
+  from google.adk.apps import App
+  from google.adk.events.event import Event
+  from google.adk.events.request_input import RequestInput
+
+  from tests.unittests import testing_utils
+  from tests.unittests.workflow import workflow_testing_utils
+
+  # Given a workflow simulating the request_input sample
+  @node
+  def process_input(node_input: Any):
+    yield Event(state={"complaint": node_input, "feedback": ""})
+
+  @node
+  def draft_email(ctx: Context):
+    complaint = ctx.state.get('complaint')
+    feedback = ctx.state.get('feedback')
+    yield Event(output=f"Draft based on {complaint} and feedback {feedback}")
+
+  @node(rerun_on_resume=True)
+  def request_human_review(node_input: Any, ctx: Context):
+    resume = ctx.resume_inputs.get('human_review')
+    if not resume:
+      yield RequestInput(
+          interrupt_id='human_review',
+          message=f"Please review: {node_input}",
+      )
+      return
+    yield Event(output=resume)
+
+  request_human_review.wait_for_output = True
+
+  @node
+  def handle_human_review(node_input: Any):
+    result = node_input.get('result') if isinstance(node_input, dict) else node_input
+    if result == "approve":
+      yield Event(route="approved")
+    else:
+      yield Event(state={"feedback": result}, route="revise")
+
+  @node
+  def end_node(node_input: Any):
+    yield Event(output="done")
+
+  wf = Workflow(
+      name="request_input",
+      edges=[
+          (
+              START,
+              process_input,
+              draft_email,
+              request_human_review,
+              handle_human_review,
+          ),
+          (handle_human_review, {"revise": draft_email, "approved": end_node}),
+      ],
+  )
+
+  app = App(
+      name=request.function.__name__,
+      root_agent=wf,
+  )
+  runner = testing_utils.InMemoryRunner(app=app)
+
+  # When Turn 1 executes (starts and interrupts)
+  events1 = await runner.run_async(
+      testing_utils.get_user_content("my complaint")
+  )
+
+  # Then verify it interrupted at request_human_review@1
+  req1 = workflow_testing_utils.get_request_input_events(events1)
+  assert len(req1) == 1
+  assert 'request_human_review@1' in req1[0].node_info.path
+
+  inv_id = events1[0].invocation_id
+
+  # When Turn 2 executes (provides response and loops back)
+  events2 = await runner.run_async(
+      new_message=testing_utils.UserContent(
+          create_request_input_response('human_review', {'result': 'make it shorter'})
+      ),
+      invocation_id=inv_id,
+  )
+
+  # Then verify it triggered request_human_review again with run_id @2
+  req2 = workflow_testing_utils.get_request_input_events(events2)
+  assert len(req2) == 1
+  assert 'request_human_review@2' in req2[0].node_info.path