Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name = "moe_benchmark_standard"
description = "MoE Benchmark - DeepEP standard mode plus matrix export"
name = "moe_benchmark_HT"
description = "MoE Benchmark - high-throughput (HT) mode plus matrix export"
test_template_name = "MoEBenchmark"

[cmd_args]
docker_image_url = "/your/path/to/the/container"
benchmark_root = "/workspace/dp-benchmark/benchmark"
mode = "standard"
benchmark_root = "/workspace/DeepEP/benchmark"
mode = "standard" # benchmark CLI value for high-throughput; LL lives in moe_benchmark_low_latency.toml
deepep_versions = [
"legacy",
"elastic",
"nixl_ep",
"uccl_ep",
"nccl_ep",
"deepep_hybrid",
]
tokens = 4096
num_experts = 256
num_topk = 8
hidden_size = 7168
data_type = "bfloat16"
benchmark_combine = true
allow_nvlink_for_low_latency = false
allow_mnnvl = false
round_scale = false
Expand All @@ -37,8 +46,19 @@ shuffle_columns = false
use_kineto_profiler = false
enable_tuning = false
config_file_path = "/tmp/config.yaml"
results_dir = "/workspace/dp-benchmark/results"
results_dir = "/workspace/DeepEP/results"

[extra_env_vars]
UCX_LOG_LEVEL = "error"
NUM_QPS_PER_RANK = "12"
NUM_SMS = "24"
# nixl_ep imports the CUDA-versioned package from the meson build dir. Set PYTHONPATH
# explicitly (no trailing colon) so it is present regardless of --export behavior;
# this is harmless for v1/v2 (deep_ep is in site-packages).
PYTHONPATH = "/workspace/nixl/build/examples/device/ep"
# uccl_ep: UCCL RDMA env (mirrors NCCL's GID/iface) plus an intranode hint. UCCL's
# Buffer falls back to torch.cuda.current_device() when LOCAL_RANK is unset, so the
# per-rank device set by init_dist_slurm is used; LOCAL_WORLD_SIZE aids its
# topology detection.
UCCL_SOCKET_IFNAME = "eno3"
UCCL_IB_GID_INDEX = "3"
LOCAL_WORLD_SIZE = "8"
20 changes: 16 additions & 4 deletions conf/experimental/test/moe_benchmark_low_latency.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,20 @@
# limitations under the License.

name = "moe_benchmark_low_latency"
description = "MoE Benchmark - DeepEP low-latency mode plus matrix export"
description = "MoE Benchmark - low-latency (LL) decode mode plus matrix export"
test_template_name = "MoEBenchmark"

[cmd_args]
docker_image_url = "/your/path/to/the/container"
benchmark_root = "/path/in/the/container/to/the/tests/folder"
benchmark_root = "/workspace/DeepEP/benchmark"
mode = "low_latency"
deepep_versions = ["legacy", "nixl_ep", "uccl_ep", "nccl_ep"]
tokens = 128
num_experts = 288
num_experts = 256
num_topk = 8
hidden_size = 7168
data_type = "bfloat16"
benchmark_combine = true
allow_nvlink_for_low_latency = false
allow_mnnvl = false
round_scale = false
Expand All @@ -37,8 +39,18 @@ shuffle_columns = false
use_kineto_profiler = false
enable_tuning = false
config_file_path = "/tmp/config.yaml"
results_dir = "/workspace/dp-benchmark/results"
results_dir = "/workspace/DeepEP/results"

[extra_env_vars]
UCX_LOG_LEVEL = "error"
NUM_QPS_PER_RANK = "12"
NUM_SMS = "24"
NCCL_P2P_DISABLE = "1"
NCCL_NVLS_DISABLE = "1"
NVSHMEM_DISABLE_P2P = "1"
# nixl_ep imports the CUDA-versioned package from the meson build dir.
PYTHONPATH = "/workspace/nixl/build/examples/device/ep"
# uccl_ep: UCCL RDMA env (mirror NCCL's GID/iface) + intranode hint.
UCCL_SOCKET_IFNAME = "eno3"
UCCL_IB_GID_INDEX = "3"
LOCAL_WORLD_SIZE = "8"
1 change: 1 addition & 0 deletions conf/experimental/test/nccl_test_alltoallv.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ warmup_iters = 1
check = 1
blocking = 0
use_deepep_matrix = true
average = 3

[extra_env_vars]
NCCL_P2P_DISABLE = "1"
Expand Down
1 change: 1 addition & 0 deletions conf/experimental/test/ucc_alltoallv_deepep.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ e = "8M"
use_deepep_matrix = true

[extra_env_vars]
UCX_LOG_LEVEL = "error"
UCX_IB_GID_INDEX = "auto"
UCX_TLS = "cuda_copy,rc"
UCX_RNDV_THRESH = "0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ name = "moe-benchmark"

[[Tests]]
id = "Tests.moe_benchmark"
test_name = "moe_benchmark_standard"
test_name = "moe_benchmark_HT"
num_nodes = 2
time_limit = "00:30:00"

Expand Down
46 changes: 46 additions & 0 deletions conf/experimental/test_scenario/moe_benchmark_low_latency.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Low-latency (decode) counterpart of moe_benchmark.toml: runs the LL MoE test
# (legacy/nixl_ep/uccl_ep/nccl_ep) then the UCC/NCCL all-to-all-v baselines. Uses the
# dedicated no-NVLink baseline variants (ucc_alltoallv_deepep_nonvlink /
# nccl_test_alltoallv_nonvlink) so the baselines match the RDMA-only LL EP backends —
# no need to toggle the NVLink-ON HT baselines.
# Scenario identifier for the low-latency run.
name = "moe-benchmark-ll"

# Step 1: the low-latency MoE benchmark. It declares no dependencies; the two
# baselines below are chained after it.
[[Tests]]
id = "Tests.moe_benchmark"
test_name = "moe_benchmark_low_latency"
num_nodes = 2
time_limit = "00:30:00"

# Step 2: UCC all-to-all-v baseline (no-NVLink variant), tied to step 1 through a
# start_post_comp dependency.
# NOTE(review): presumably start_post_comp means "start once the referenced test has
# completed" — confirm against the scenario scheduler.
# NOTE(review): if the *_nonvlink baselines set use_deepep_matrix = true, they need the
# matrix produced by step 1 at runtime and cannot run with --single-sbatch — TODO confirm.
[[Tests]]
id = "Tests.ucc_alltoallv"
test_name = "ucc_alltoallv_deepep_nonvlink"
num_nodes = 2
time_limit = "00:30:00"
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.moe_benchmark"

# Step 3: NCCL all-to-all-v baseline (no-NVLink variant). It depends on the UCC
# baseline rather than on step 1, so the three tests form a single chain.
[[Tests]]
id = "Tests.nccl_alltoallv"
test_name = "nccl_test_alltoallv_nonvlink"
num_nodes = 2
time_limit = "00:30:00"
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.ucc_alltoallv"
34 changes: 34 additions & 0 deletions src/cloudai/systems/slurm/single_sbatch_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,48 @@ def get_global_env_vars(self) -> str:
vars.append(f"export {key}={value}")
return "\n".join(vars)

def _reject_single_sbatch_incompatible(self) -> None:
"""
Fail fast on tests that cannot run in single-sbatch mode, with an actionable message.

Single-sbatch builds the whole script up front, before anything runs, so a test that needs an
artifact produced by an earlier test at runtime cannot work here. Each command-gen strategy decides
whether it is compatible via single_sbatch_unsupported_reason() (workloads that don't define it are
always allowed); this runner only aggregates the reasons and raises, staying workload-agnostic.
"""
# Returns None when nothing is reported; raises ValueError otherwise.
# One formatted bullet line per test run that reports a reason.
problems: list[str] = []
for tr in self.all_trs:
cmd_gen = self.get_cmd_gen_strategy(self.system, tr)
# Optional hook: looked up by name, so a strategy that lacks the attribute (or exposes a
# non-callable one) yields reason = None and is treated as compatible.
reason_fn = getattr(cmd_gen, "single_sbatch_unsupported_reason", None)
reason = reason_fn() if callable(reason_fn) else None
# Any falsy reason (None or an empty string) means the test run is allowed.
if reason:
problems.append(f" - {tr.name}: {reason}")
# All collected reasons are joined into the message of a single ValueError.
# NOTE(review): indentation is not visible in this view; per the docstring the reasons are
# aggregated first, so this check presumably runs after the loop — confirm.
if problems:
raise ValueError("These tests cannot run in single-sbatch mode:\n" + "\n".join(problems))

def gen_sbatch_content(self) -> str:
self._reject_single_sbatch_incompatible()
content: list[str] = ["#!/bin/bash", *self.get_sbatch_directives(), ""]
content.extend(self.aux_commands())
content.append("")

content.append(self.get_global_env_vars())
content.append("")

# Job-scoped prologue (head-node detection / etcd rendezvous). Only workloads that
# define gen_job_prologue() emit anything (e.g. MoE); others contribute nothing.
# Emit once, deduped.
seen_prologues: set[str] = set()
for tr in self.all_trs:
cmd_gen = self.get_cmd_gen_strategy(self.system, tr)
prologue_fn = getattr(cmd_gen, "gen_job_prologue", None)
prologue: list[str] = cast("list[str]", prologue_fn()) if callable(prologue_fn) else []
key = "\n".join(prologue)
if prologue and key not in seen_prologues:
seen_prologues.add(key)
content.extend(prologue)
content.append("")

tr = self.test_scenario.test_runs[0]
if tr.pre_test:
content.append(self.add_pre_tests(tr.pre_test, tr))
Expand Down
8 changes: 8 additions & 0 deletions src/cloudai/workloads/common/moe_benchmark_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@

MOE_BENCHMARK_PREV_MOUNT = "/cloudai_moe_benchmark_prev"

# Explanation of why a baseline configured with use_deepep_matrix=True is rejected in
# single-sbatch mode. The UCC and NCCL command-gen strategies share this text and pass it to the
# runner through single_sbatch_unsupported_reason().
# The message is assembled from sentence-sized pieces joined by single spaces.
DEEPEP_MATRIX_SINGLE_SBATCH_REASON = " ".join(
    (
        "use_deepep_matrix=True replays the MoE benchmark's runtime traffic matrix,",
        "which is produced only after the MoE test runs.",
        "Single-sbatch builds the whole script before anything runs, so the matrix is unavailable.",
        "Run this scenario in multi-sbatch mode (drop --single-sbatch), or set use_deepep_matrix=false.",
    )
)


def start_post_comp_chain(test_run: TestRun) -> list[TestRun]:
"""Follow ``start_post_comp`` (e.g. UCC -> NCCL -> MoE benchmark)."""
Expand Down
13 changes: 11 additions & 2 deletions src/cloudai/workloads/moe_benchmark/moe_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

from typing import Literal, Optional

from pydantic import Field

from cloudai.core import DockerImage, Installable
from cloudai.models.workload import CmdArgs, TestDefinition

Expand All @@ -24,8 +26,9 @@ class MoEBenchmarkCmdArgs(CmdArgs):
"""Command arguments for the custom MoE benchmark that compares EP/alltoallv backends."""

docker_image_url: str
benchmark_root: str = "/workspace/dp-benchmark/benchmark"
benchmark_root: str = "/workspace/DeepEP/benchmark"
mode: Literal["standard", "low_latency"] = "standard"
deepep_versions: list[str] = Field(default_factory=lambda: ["legacy", "elastic"])
tokens: int = 1024
num_experts: int = 256
num_topk: int = 8
Expand All @@ -35,15 +38,20 @@ class MoEBenchmarkCmdArgs(CmdArgs):
allow_mnnvl: bool = False
round_scale: bool = False
use_ue8m0: bool = False
benchmark_combine: bool = True
num_warmups: int = 20
num_iterations: int = 50
shuffle_columns: bool = False
use_kineto_profiler: bool = False
enable_tuning: bool = False
num_sms: int = 24
num_qps_per_rank: int = 12

v2_num_sms: int = 12
v2_num_qps: int = 0
v2_prefer_overlap_with_compute: bool = False
config_file_path: str = "/tmp/config.yaml"
results_dir: str = "/workspace/dp-benchmark/results"
results_dir: str = "/workspace/DeepEP/results"


class MoEBenchmarkTestDefinition(TestDefinition):
Expand Down Expand Up @@ -72,6 +80,7 @@ def cmd_args_dict(self) -> dict:
"docker_image_url",
"benchmark_root",
"mode",
"deepep_versions",
"num_sms",
"num_qps_per_rank",
"config_file_path",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def generate_report(self) -> None:
"num_ranks",
"num_tokens",
"hidden",
"deepep_time",
"time_s",
"bus_bw_avg",
"bus_bw_min",
"bus_bw_max",
Expand Down
Loading
Loading