From 2a364805d23509c078a45fab78f292e10ea6abf0 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 12:50:39 +0000 Subject: [PATCH 1/6] MinimaxM2.5-FP8-MI325x-vLLM: pin AITER FA attention backend vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN, causing a ~38% throughput regression for MiniMax-M2.5 FP8 on MI325X (vllm-project/vllm#43029). Align benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh with the merged upstream recipe (vllm-project/recipes#481) to restore the v0.18.0 attention path on the v0.21.0 image: - export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 (asm/hip paged-attention auto-dispatch) - pass --attention-backend ROCM_AITER_FA to vllm serve --- benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh index 13867ce7e..22f05a644 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh @@ -24,9 +24,8 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -# following AMD andy's recipe -# https://www.linkedin.com/posts/andyluo77_day-0-support-of-minimax-25-on-amd-gpu-activity-7428151527309025280-hXR8/ export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -52,6 +51,7 @@ $EP \ --max-model-len $MAX_MODEL_LEN \ --block-size=32 \ --no-enable-prefix-caching \ +--attention-backend ROCM_AITER_FA \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! From e58aba4da1bea431527f12f8642c73eb99ae376f Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 12:57:33 +0000 Subject: [PATCH 2/6] Update the perf-changelog --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2afe61dbe..1f70cc3b2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3228,3 +3228,11 @@ - "Picks up the fix for the GSM8K accuracy regression reported in sgl-project/sglang#25742 (v0.5.12-20260517 collapsed to ~0.32 at TP=2)" - "Local eval-only runs on MI355X recover to gsm8k strict-match 0.975 at TP=2/conc=64 and 0.974 at TP=4/conc=16, well above the 0.92 upstream gate added in sgl-project/sglang#26396" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1593 + +- config-keys: + - minimaxm2.5-fp8-mi325x-vllm + description: + - "Pin AITER FA attention backend in benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh to recover the ~38% MI325X throughput regression introduced when vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN (vllm-project/vllm#43029)" + - "Export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 to enable the AITER asm/hip paged-attention auto-dispatch" + - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594 From b69765b16983668d7a7e5a5ae61663ad12034074 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 14:25:23 +0000 Subject: [PATCH 3/6] runners/launch_mi325x-amds.sh: propagate srun failures --- runners/launch_mi325x-amds.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 810cbde2f..2e9bc406a 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -euo pipefail export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" export PORT=8888 @@ -13,17 +14,20 @@ set -x # chi-mi325x-pod1-121: enroot-aufs2ovlfs setcap fails on this node's NFS-backed # squash dir; container image import never completes # (root-caused via #1467/#1468/#1469 sweep failures). -JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" + echo "ERROR: salloc failed to allocate a job" >&2 exit 1 fi +trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; exit "$rc"' EXIT + # Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " +srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c " + set -euo pipefail exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE' >&2; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then echo 'Squash file already exists and is valid, skipping import' else @@ -31,9 +35,9 @@ srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " enroot import -o \"$SQUASH_FILE\" docker://$IMAGE fi " -srun --jobid=$JOB_ID \ ---container-image=$SQUASH_FILE \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +srun --jobid="$JOB_ID" \ +--container-image="$SQUASH_FILE" \ +--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \ --container-mount-home \ --container-writable \ --container-remap-root \ From fde577731bccaab2cce9470d669eb184cd85100c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 17:54:52 +0000 Subject: [PATCH 4/6] minimaxm2.5-fp8-mi325x-vllm: align with upstream MiniMax-M2.5 ROCm recipe --- .github/configs/amd-master.yaml | 2 +- benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh | 1 + perf-changelog.yaml | 2 ++ runners/launch_mi325x-amds.sh | 10 +++++++++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index aba66160b..da9db6ed6 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1038,7 +1038,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } minimaxm2.5-fp8-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.20.2 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh index 22f05a644..4c01bc3d8 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh @@ -52,6 +52,7 @@ $EP \ --block-size=32 \ --no-enable-prefix-caching \ --attention-backend ROCM_AITER_FA \ +--compilation-config '{"mode":3,"cudagraph_mode":"PIECEWISE"}' \ --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1f70cc3b2..c764c0c1a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3235,4 +3235,6 @@ - "Pin AITER FA attention backend in benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh to recover the ~38% MI325X throughput regression introduced when vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN (vllm-project/vllm#43029)" - "Export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 to enable the AITER asm/hip paged-attention auto-dispatch" - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)" + - "Pin image vllm/vllm-openai-rocm:v0.20.2 — the version the upstream recipe explicitly validates (`min_vllm_version: 0.20.2`). v0.21.0 separately crashes during AITER MoE CUDA-graph capture on MiniMax-M2.5 (silent worker death, `Engine core initialization failed`) reproducible via the recipe's exact flags; v0.20.2 + recipe completes a 100-prompt vllm bench serve cleanly at 2030 tok/s total throughput on MI325X (TP=4)" + - "Add --compilation-config '{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\"}' to vllm serve, mirroring `model.base_args` from the upstream recipe. `pass_config.fuse_minimax_qk_norm` from the recipe is intentionally omitted — it triggers an upstream NameError on ROCm because vllm/compilation/passes/pass_manager.py imports MiniMaxQKNormPass under `is_cuda()` (NVIDIA-only) while using it unconditionally" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594 diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 2e9bc406a..d9b6de862 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -8,6 +8,14 @@ PARTITION="compute" SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +cleanup_stale_benchmark_logs() { + if [[ -n "${GITHUB_WORKSPACE:-}" ]]; then + sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \ + rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true + fi +} +cleanup_stale_benchmark_logs + set -x # Exclude known-broken mi325x nodes: @@ -21,7 +29,7 @@ if [ -z "$JOB_ID" ]; then exit 1 fi -trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; exit "$rc"' EXIT +trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT # Use flock to serialize concurrent imports to the same squash file srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c " From d0fbcf4d19363c5b434694d23d0aabc8cac304de Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 19:15:32 +0000 Subject: [PATCH 5/6] runners/launch_mi325x-amds.sh: derive PORT per job; sudo -n in cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use `40000 + (JOB_ID % 10000)` instead of a hard-coded 8888 — a non-SLURM Docker workload on chi-mi325x-pod1-019 bound :8888 and made every sweep job scheduled there fail in sock.bind() with EADDRINUSE before vLLM ran. Also harden the benchmark_logs trap with `sudo -n` so it fails fast under a non-tty instead of hanging. --- runners/launch_mi325x-amds.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index d9b6de862..f44fb46ae 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -2,7 +2,6 @@ set -euo pipefail export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/" -export PORT=8888 PARTITION="compute" SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -10,7 +9,7 @@ LOCK_FILE="${SQUASH_FILE}.lock" cleanup_stale_benchmark_logs() { if [[ -n "${GITHUB_WORKSPACE:-}" ]]; then - sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \ + sudo -n rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \ rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true fi } @@ -29,6 +28,8 @@ if [ -z "$JOB_ID" ]; then exit 1 fi +export PORT=$(( 40000 + (JOB_ID % 10000) )) + trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT # Use flock to serialize concurrent imports to the same squash file From 5471c536f4e2ba3ff5a7059b44c8b43257bbf9e7 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Sat, 30 May 2026 20:44:03 +0000 Subject: [PATCH 6/6] minimaxm2.5-fp8-mi325x-vllm: gate SHUFFLE_KV_CACHE_LAYOUT per (TP, CONC) Set VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 (recipes#481 pillar 2) only at shapes where AITER's gfx942 ASM paged-attn kernel exists: TP=2 EP=1 CONC<=16, TP=8 EP=8 CONC<=64. Above those, pa_fwd_asm hits `get_heuristic_kernel: cannot get heuristic kernel!` (gqa=6, block_size=32, qTile=0) and HTTP-500s every request. Mirrors the per-shape toggle in the mi355x sibling. vllm#43029, sweep run 26692603804. --- benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh | 11 ++++++++++- perf-changelog.yaml | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh index 4c01bc3d8..657e1b5ae 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh @@ -25,7 +25,16 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + +ENABLE_SHUFFLE_KV_CACHE_LAYOUT=0 +if [[ "$TP" == "2" && "$EP_SIZE" == "1" ]] && (( CONC <= 16 )); then + ENABLE_SHUFFLE_KV_CACHE_LAYOUT=1 +elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC <= 64 )); then + ENABLE_SHUFFLE_KV_CACHE_LAYOUT=1 +fi +if (( ENABLE_SHUFFLE_KV_CACHE_LAYOUT )); then + export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 +fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c764c0c1a..ad93a24b4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3237,4 +3237,5 @@ - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)" - "Pin image vllm/vllm-openai-rocm:v0.20.2 — the version the upstream recipe explicitly validates (`min_vllm_version: 0.20.2`). v0.21.0 separately crashes during AITER MoE CUDA-graph capture on MiniMax-M2.5 (silent worker death, `Engine core initialization failed`) reproducible via the recipe's exact flags; v0.20.2 + recipe completes a 100-prompt vllm bench serve cleanly at 2030 tok/s total throughput on MI325X (TP=4)" - "Add --compilation-config '{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\"}' to vllm serve, mirroring `model.base_args` from the upstream recipe. `pass_config.fuse_minimax_qk_norm` from the recipe is intentionally omitted — it triggers an upstream NameError on ROCm because vllm/compilation/passes/pass_manager.py imports MiniMaxQKNormPass under `is_cuda()` (NVIDIA-only) while using it unconditionally" + - "Conditionally enable VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 per (TP, EP, CONC) — on for shapes where the AITER ASM paged-attention kernel exists in the gfx942 heuristic table (TP=2 EP=1 CONC<=16, TP=8 EP=8 CONC<=64), off otherwise. Above the thresholds vllm/v1/attention/backends/rocm_aiter_fa.py routes decode through aiter pa_fwd_asm and crashes with `RuntimeError: get_heuristic_kernel: cannot get heuristic kernel!` for MiniMax-M2.5's attention shape (gqa=6 block_size=32 qTile=0); below them the ASM auto-dispatch is the perf win the recipe wants. Thresholds confirmed across 17 bench cells + 3 eval cells in PR #1594 sweep run 26692603804. Mirrors the per-shape toggle pattern in benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh; can collapse to unconditional SHUFFLE=1 once AITER registers the missing kernel on gfx942" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594