NVIDIA · ybenvidia · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jul 2, 2026
@@ -14,19 +14,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name = "moe_benchmark_standard"
-description = "MoE Benchmark - DeepEP standard mode plus matrix export"
+name = "moe_benchmark_HT"
+description = "MoE Benchmark - high-throughput (HT) mode plus matrix export"
 test_template_name = "MoEBenchmark"
 
 [cmd_args]
 docker_image_url = "/your/path/to/the/container"
-benchmark_root = "/workspace/dp-benchmark/benchmark"
-mode = "standard"
+benchmark_root = "/workspace/DeepEP/benchmark"
+mode = "standard" # benchmark CLI value for high-throughput; LL lives in moe_benchmark_low_latency.toml
+deepep_versions = [
+  "legacy",
+  "elastic",
+  "nixl_ep",
+  "uccl_ep",
+  "nccl_ep",
+  "deepep_hybrid",
+]
 tokens = 4096
 num_experts = 256
 num_topk = 8
 hidden_size = 7168
 data_type = "bfloat16"
+benchmark_combine = true
 allow_nvlink_for_low_latency = false
 allow_mnnvl = false
 round_scale = false
@@ -37,8 +46,19 @@ shuffle_columns = false
 use_kineto_profiler = false
 enable_tuning = false
 config_file_path = "/tmp/config.yaml"
-results_dir = "/workspace/dp-benchmark/results"
+results_dir = "/workspace/DeepEP/results"
 
 [extra_env_vars]
+UCX_LOG_LEVEL = "error"
 NUM_QPS_PER_RANK = "12"
 NUM_SMS = "24"
+# nixl_ep imports the CUDA-versioned package from the meson build dir. Set
+# PYTHONPATH explicitly (no trailing colon) so it is present regardless of --export
+# behavior; harmless for v1/v2 (deep_ep is in site-packages).
+PYTHONPATH = "/workspace/nixl/build/examples/device/ep"
+# uccl_ep: UCCL RDMA env (mirrors NCCL's GID/interface) plus an intranode hint.
+# UCCL's Buffer falls back to torch.cuda.current_device() when LOCAL_RANK is unset,
+# so init_dist_slurm's per-rank device is used; LOCAL_WORLD_SIZE aids its topology detection.
+UCCL_SOCKET_IFNAME = "eno3"
+UCCL_IB_GID_INDEX = "3"
+LOCAL_WORLD_SIZE = "8"
@@ -15,18 +15,20 @@
 # limitations under the License.
 
 name = "moe_benchmark_low_latency"
-description = "MoE Benchmark - DeepEP low-latency mode plus matrix export"
+description = "MoE Benchmark - low-latency (LL) decode mode plus matrix export"
 test_template_name = "MoEBenchmark"
 
 [cmd_args]
 docker_image_url = "/your/path/to/the/container"
-benchmark_root = "/path/in/the/container/to/the/tests/folder"
+benchmark_root = "/workspace/DeepEP/benchmark"
 mode = "low_latency"
+deepep_versions = ["legacy", "nixl_ep", "uccl_ep", "nccl_ep"]
 tokens = 128
-num_experts = 288
+num_experts = 256
 num_topk = 8
 hidden_size = 7168
 data_type = "bfloat16"
+benchmark_combine = true
 allow_nvlink_for_low_latency = false
 allow_mnnvl = false
 round_scale = false
@@ -37,8 +39,18 @@ shuffle_columns = false
 use_kineto_profiler = false
 enable_tuning = false
 config_file_path = "/tmp/config.yaml"
-results_dir = "/workspace/dp-benchmark/results"
+results_dir = "/workspace/DeepEP/results"
 
 [extra_env_vars]
+UCX_LOG_LEVEL = "error"
 NUM_QPS_PER_RANK = "12"
 NUM_SMS = "24"
+NCCL_P2P_DISABLE = "1"
+NCCL_NVLS_DISABLE = "1"
+NVSHMEM_DISABLE_P2P = "1"
+# nixl_ep imports the CUDA-versioned package from the meson build dir.
+PYTHONPATH = "/workspace/nixl/build/examples/device/ep"
+# uccl_ep: UCCL RDMA env (mirrors NCCL's GID/interface) plus an intranode hint.
+UCCL_SOCKET_IFNAME = "eno3"
+UCCL_IB_GID_INDEX = "3"
+LOCAL_WORLD_SIZE = "8"
@@ -31,6 +31,7 @@ warmup_iters = 1
 check = 1
 blocking = 0
 use_deepep_matrix = true
+average = 3
 
 [extra_env_vars]
 NCCL_P2P_DISABLE = "1"

@@ -26,6 +26,7 @@ e = "8M"
 use_deepep_matrix = true
 
 [extra_env_vars]
+UCX_LOG_LEVEL = "error"
 UCX_IB_GID_INDEX = "auto"
 UCX_TLS = "cuda_copy,rc"
 UCX_RNDV_THRESH = "0"

@@ -18,7 +18,7 @@ name = "moe-benchmark"
 
 [[Tests]]
 id = "Tests.moe_benchmark"
-test_name = "moe_benchmark_standard"
+test_name = "moe_benchmark_HT"
 num_nodes = 2
 time_limit = "00:30:00"
 

@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Low-latency (decode) counterpart of moe_benchmark.toml: runs the LL MoE test
+# (legacy/nixl_ep/uccl_ep/nccl_ep) then the UCC/NCCL all-to-all-v baselines. Uses the
+# dedicated no-NVLink baseline variants (ucc_alltoallv_deepep_nonvlink /
+# nccl_test_alltoallv_nonvlink) so the baselines match the RDMA-only LL EP backends —
+# no need to toggle the NVLink-ON HT baselines.
+name = "moe-benchmark-ll"
+
+[[Tests]]
+id = "Tests.moe_benchmark"
+test_name = "moe_benchmark_low_latency"
+num_nodes = 2
+time_limit = "00:30:00"
+
+[[Tests]]
+id = "Tests.ucc_alltoallv"
+test_name = "ucc_alltoallv_deepep_nonvlink"
+num_nodes = 2
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "Tests.moe_benchmark"
+
+[[Tests]]
+id = "Tests.nccl_alltoallv"
+test_name = "nccl_test_alltoallv_nonvlink"
+num_nodes = 2
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "Tests.ucc_alltoallv"
@@ -141,14 +141,48 @@ def get_global_env_vars(self) -> str:
             vars.append(f"export {key}={value}")
         return "\n".join(vars)
 
+    def _reject_single_sbatch_incompatible(self) -> None:
+        """
+        Fail fast on tests that cannot run in single-sbatch mode, with an actionable message.
+
+        Single-sbatch builds the whole script up front, so a test that needs an artifact produced by an
+        earlier test at runtime cannot work here. Each command-gen strategy opts out by returning a reason
+        from single_sbatch_unsupported_reason() (strategies that don't define it are always allowed).
+        Raises ValueError listing every offending test run and its reason; returns None otherwise.
+        """
+        problems: list[str] = []
+        for tr in self.all_trs:
+            cmd_gen = self.get_cmd_gen_strategy(self.system, tr)
+            reason_fn = getattr(cmd_gen, "single_sbatch_unsupported_reason", None)
+            reason = reason_fn() if callable(reason_fn) else None
+            if reason:
+                problems.append(f"  - {tr.name}: {reason}")
+        if problems:
+            raise ValueError("These tests cannot run in single-sbatch mode:\n" + "\n".join(problems))
+
     def gen_sbatch_content(self) -> str:
+        self._reject_single_sbatch_incompatible()
         content: list[str] = ["#!/bin/bash", *self.get_sbatch_directives(), ""]
         content.extend(self.aux_commands())
         content.append("")
 
         content.append(self.get_global_env_vars())
         content.append("")
 
+        # Job-scoped prologue (head-node detection / etcd rendezvous). Only workloads whose
+        # command-gen strategy defines gen_job_prologue() emit anything (e.g. MoE); others
+        # contribute nothing. Identical prologues from several test runs are emitted only once.
+        seen_prologues: set[str] = set()
+        for tr in self.all_trs:
+            cmd_gen = self.get_cmd_gen_strategy(self.system, tr)
+            prologue_fn = getattr(cmd_gen, "gen_job_prologue", None)
+            prologue: list[str] = cast("list[str]", prologue_fn()) if callable(prologue_fn) else []
+            key = "\n".join(prologue)
+            if prologue and key not in seen_prologues:
+                seen_prologues.add(key)
+                content.extend(prologue)
+                content.append("")
+
         tr = self.test_scenario.test_runs[0]
         if tr.pre_test:
             content.append(self.add_pre_tests(tr.pre_test, tr))

@@ -24,6 +24,14 @@
 
 MOE_BENCHMARK_PREV_MOUNT = "/cloudai_moe_benchmark_prev"
 
+# Why a use_deepep_matrix=True baseline cannot run in single-sbatch mode. Shared by the UCC and
+# NCCL command-gen strategies, surfaced to the runner via single_sbatch_unsupported_reason().
+DEEPEP_MATRIX_SINGLE_SBATCH_REASON = (
+    "use_deepep_matrix=True replays the MoE benchmark's runtime traffic matrix, which is produced only after "
+    "the MoE test runs. Single-sbatch builds the whole script before anything runs, so the matrix is "
+    "unavailable. Run this scenario in multi-sbatch mode (drop --single-sbatch), or set use_deepep_matrix=false."
+)
+
 
 def start_post_comp_chain(test_run: TestRun) -> list[TestRun]:
     """Follow ``start_post_comp`` (e.g. UCC -> NCCL -> MoE benchmark)."""

@@ -16,6 +16,8 @@
 
 from typing import Literal, Optional
 
+from pydantic import Field
+
 from cloudai.core import DockerImage, Installable
 from cloudai.models.workload import CmdArgs, TestDefinition
 
@@ -24,8 +26,9 @@ class MoEBenchmarkCmdArgs(CmdArgs):
     """Command arguments for the custom MoE benchmark that compares EP/alltoallv backends."""
 
     docker_image_url: str
-    benchmark_root: str = "/workspace/dp-benchmark/benchmark"
+    benchmark_root: str = "/workspace/DeepEP/benchmark"
     mode: Literal["standard", "low_latency"] = "standard"
+    deepep_versions: list[str] = Field(default_factory=lambda: ["legacy", "elastic"])
     tokens: int = 1024
     num_experts: int = 256
     num_topk: int = 8
@@ -35,15 +38,20 @@ class MoEBenchmarkCmdArgs(CmdArgs):
     allow_mnnvl: bool = False
     round_scale: bool = False
     use_ue8m0: bool = False
+    benchmark_combine: bool = True
     num_warmups: int = 20
     num_iterations: int = 50
     shuffle_columns: bool = False
     use_kineto_profiler: bool = False
     enable_tuning: bool = False
     num_sms: int = 24
     num_qps_per_rank: int = 12
+
+    v2_num_sms: int = 12
+    v2_num_qps: int = 0
+    v2_prefer_overlap_with_compute: bool = False
     config_file_path: str = "/tmp/config.yaml"
-    results_dir: str = "/workspace/dp-benchmark/results"
+    results_dir: str = "/workspace/DeepEP/results"
 
 
 class MoEBenchmarkTestDefinition(TestDefinition):
@@ -72,6 +80,7 @@ def cmd_args_dict(self) -> dict:
                 "docker_image_url",
                 "benchmark_root",
                 "mode",
+                "deepep_versions",
                 "num_sms",
                 "num_qps_per_rank",
                 "config_file_path",

@@ -82,7 +82,7 @@ def generate_report(self) -> None:
                 "num_ranks",
                 "num_tokens",
                 "hidden",
-                "deepep_time",
+                "time_s",
                 "bus_bw_avg",
                 "bus_bw_min",
                 "bus_bw_max",