Skip to content
Merged
5 changes: 3 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

dsv4-fp4-b200-vllm:
image: vllm/vllm-openai:v0.21.0
image: vllm/vllm-openai:v0.22.0
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
Expand All @@ -1770,7 +1770,8 @@ dsv4-fp4-b200-vllm:
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
- isl: 8192
osl: 1024
search-space:
Expand Down
11 changes: 3 additions & 8 deletions benchmarks/single_node/dsv4_fp4_b200_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
EP_ARGS=(--enable-expert-parallel)
fi

# Mega-MoE backend and the lower GMU only kick in on the DP-attn path,
Comment thread
wzhao18 marked this conversation as resolved.
# per the vLLM v0.20.0 DeepSeek-V4-Pro recipe. All configs share the
# FULL_AND_PIECEWISE compilation config.
Comment thread
wzhao18 marked this conversation as resolved.
GMU_ARGS=()
MOE_ARGS=()
if [ "${DP_ATTENTION}" = "true" ]; then
GMU_ARGS=(--gpu-memory-utilization 0.85)
MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
fi

Comment thread
wzhao18 marked this conversation as resolved.
Expand All @@ -58,10 +54,9 @@ else
MAX_NUM_BATCHED_TOKENS=2048
fi

MAX_CUDAGRAPH_CAPTURE_SIZE=2048

BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
BENCHMARK_MAX_MODEL_LEN=4096
fi

if [ "${EVAL_ONLY}" = "true" ]; then
EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
Expand Down Expand Up @@ -90,7 +85,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
--tool-call-parser deepseek_v4 \
--enable-auto-tool-choice \
--reasoning-parser deepseek_v4 \
--max-cudagraph-capture-size 2048 \
--max-cudagraph-capture-size "$MAX_CUDAGRAPH_CAPTURE_SIZE" \
--max-model-len "$SERVE_MAX_MODEL_LEN" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &

Expand Down
6 changes: 6 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3208,3 +3208,9 @@
- "1k1k and 8k1k STP hightpt and lowlat srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/ (resolved from upstream srt-slurm PR #160 via srtctl resolve-override)"
- "Wire glm5/fp8 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh with SA upstream defaults (SLURM_PARTITION=batch_1, SLURM_ACCOUNT=benchmark, SQUASH_FILE under /home/sa-shared/gharunners/squash/)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1557

- config-keys:
- dsv4-fp4-b200-vllm
description:
- "Update vLLM image tag to v0.22.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384
Comment thread
wzhao18 marked this conversation as resolved.
Loading