From 0b4ed808c2f0b979b36006dd98a37b79bf37f412 Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Thu, 14 May 2026 21:36:08 -0700 Subject: [PATCH 1/8] Try updating b200 dsv4 --- .github/configs/nvidia-master.yaml | 2 +- benchmarks/single_node/dsv4_fp4_b200_vllm.sh | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c05299917..f9580444e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1734,7 +1734,7 @@ dsv4-fp4-b200-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.20.2 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh index 312d41472..b205a5577 100755 --- a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh @@ -42,13 +42,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) fi -# Mega-MoE backend and the lower GMU only kick in on the DP-attn path, -# per the vLLM v0.20.0 DeepSeek-V4-Pro recipe. All configs share the -# FULL_AND_PIECEWISE compilation config. GMU_ARGS=() MOE_ARGS=() if [ "${DP_ATTENTION}" = "true" ]; then - GMU_ARGS=(--gpu-memory-utilization 0.85) MOE_ARGS=(--moe-backend deep_gemm_mega_moe) fi @@ -58,11 +54,14 @@ else MAX_NUM_BATCHED_TOKENS=2048 fi -BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" -if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then - BENCHMARK_MAX_MODEL_LEN=4096 +if [ "${ISL}" -eq 8192 ]; then + MAX_CUDAGRAPH_CAPTURE_SIZE=512 +else + MAX_CUDAGRAPH_CAPTURE_SIZE=2048 fi +BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" + if [ "${EVAL_ONLY}" = "true" ]; then EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") export EVAL_MAX_MODEL_LEN @@ -90,7 +89,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ --reasoning-parser deepseek_v4 \ - --max-cudagraph-capture-size 2048 \ + --max-cudagraph-capture-size "$MAX_CUDAGRAPH_CAPTURE_SIZE" \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & From ba46c77a8b592091ac8becb38367ca18c3ec603c Mon Sep 17 00:00:00 2001 From: wzhao18 Date: Thu, 14 May 2026 21:38:31 -0700 Subject: [PATCH 2/8] add changelog --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b173f8127..9b1cc9ed9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2486,3 +2486,9 @@ description: - "Update SGLang image from v0.5.9-cu130 to v0.5.11-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1322 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Update vLLM image from v0.20.0 to v0.20.2" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384 From c5d239fcbe05da28f949702dfd48acbae1ac7f19 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 15 May 2026 09:45:43 -0400 Subject: [PATCH 3/8] Set MAX_CUDAGRAPH_CAPTURE_SIZE to 2048 unconditionally --- benchmarks/single_node/dsv4_fp4_b200_vllm.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh index b205a5577..cedcda59f 100755 --- a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh @@ -54,11 +54,7 @@ else MAX_NUM_BATCHED_TOKENS=2048 fi -if [ "${ISL}" -eq 8192 ]; then - MAX_CUDAGRAPH_CAPTURE_SIZE=512 -else - MAX_CUDAGRAPH_CAPTURE_SIZE=2048 -fi +MAX_CUDAGRAPH_CAPTURE_SIZE=2048 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" From 11d0585801aeabd48d9abf36160634fb98e39a7a Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Thu, 28 May 2026 22:42:43 -0400 Subject: [PATCH 4/8] Update Docker image for dsv4-fp4-b200-vllm --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f9580444e..4b264f89e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1734,7 +1734,7 @@ dsv4-fp4-b200-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:nightly-626fa9bba5663a5cf6a870debf031ee344ddb822 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 From 715429d72ba2562c65e2c280edc087b009bb45d7 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 29 May 2026 12:18:55 -0400 Subject: [PATCH 5/8] Update vLLM image tag in perf-changelog.yaml Updated the vLLM image tag to specify the nightly version. --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b2262c762..3f15dacec 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3204,5 +3204,5 @@ - config-keys: - dsv4-fp4-b200-vllm description: - - "Update vLLM image" + - "Update vLLM image tag to nightly-626fa9bba5663a5cf6a870debf031ee344ddb822" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384 From 0a846851527b278d043b1c84d48509598fca21f2 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 29 May 2026 12:22:09 -0400 Subject: [PATCH 6/8] Update Docker image tag for dsv4-fp4-b200-vllm --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 32a313370..eaa344968 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: - image: vllm/vllm-openai:nightly-626fa9bba5663a5cf6a870debf031ee344ddb822 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 From 00c24f3ef3879f07f77402cdfdc888c62cfe6748 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 29 May 2026 12:22:37 -0400 Subject: [PATCH 7/8] Update vLLM image tag to v0.22.0 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3f15dacec..fc464e10c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3204,5 +3204,5 @@ - config-keys: - dsv4-fp4-b200-vllm description: - - "Update vLLM image tag to nightly-626fa9bba5663a5cf6a870debf031ee344ddb822" + - "Update vLLM image tag to v0.22.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384 From 527d0fbd19549921f4a935c7fdf56583300cf623 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Fri, 29 May 2026 12:25:40 -0400 Subject: [PATCH 8/8] Update conc-end values in nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index eaa344968..4c7786532 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1770,7 +1770,8 @@ dsv4-fp4-b200-vllm: search-space: - { tp: 8, conc-start: 1, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 128, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: