diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 30c657da1..3026739d5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1770,7 +1770,8 @@ dsv4-fp4-b200-vllm: search-space: - { tp: 8, conc-start: 1, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 128, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh index 312d41472..cedcda59f 100755 --- a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh @@ -42,13 +42,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) fi -# Mega-MoE backend and the lower GMU only kick in on the DP-attn path, -# per the vLLM v0.20.0 DeepSeek-V4-Pro recipe. All configs share the -# FULL_AND_PIECEWISE compilation config. GMU_ARGS=() MOE_ARGS=() if [ "${DP_ATTENTION}" = "true" ]; then - GMU_ARGS=(--gpu-memory-utilization 0.85) MOE_ARGS=(--moe-backend deep_gemm_mega_moe) fi @@ -58,10 +54,9 @@ else MAX_NUM_BATCHED_TOKENS=2048 fi +MAX_CUDAGRAPH_CAPTURE_SIZE=2048 + BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" -if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then - BENCHMARK_MAX_MODEL_LEN=4096 -fi if [ "${EVAL_ONLY}" = "true" ]; then EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") @@ -90,7 +85,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ - --max-cudagraph-capture-size 2048 \ + --max-cudagraph-capture-size "$MAX_CUDAGRAPH_CAPTURE_SIZE" \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1b194b52f..2ea8d72de 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3208,3 +3208,9 @@ - "1k1k and 8k1k STP hightpt and lowlat srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/ (resolved from upstream srt-slurm PR #160 via srtctl resolve-override)" - "Wire glm5/fp8 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh with SA upstream defaults (SLURM_PARTITION=batch_1, SLURM_ACCOUNT=benchmark, SQUASH_FILE under /home/sa-shared/gharunners/squash/)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1557 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Update vLLM image tag to v0.22.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384