From 2a364805d23509c078a45fab78f292e10ea6abf0 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 12:50:39 +0000
Subject: [PATCH 1/6] MinimaxM2.5-FP8-MI325x-vLLM: pin AITER FA attention
 backend

vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense
full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN, causing
a ~38% throughput regression for MiniMax-M2.5 FP8 on MI325X
(vllm-project/vllm#43029).

Align benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh with the merged
upstream recipe (vllm-project/recipes#481) to restore the v0.18.0
attention path on the v0.21.0 image:

  - export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 (asm/hip paged-attention
    auto-dispatch)
  - pass --attention-backend ROCM_AITER_FA to vllm serve
---
 benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 13867ce7e..22f05a644 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -24,9 +24,8 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-# following AMD andy's recipe 
-# https://www.linkedin.com/posts/andyluo77_day-0-support-of-minimax-25-on-amd-gpu-activity-7428151527309025280-hXR8/
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,6 +51,7 @@ $EP \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=32 \
 --no-enable-prefix-caching \
+--attention-backend ROCM_AITER_FA \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From e58aba4da1bea431527f12f8642c73eb99ae376f Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 12:57:33 +0000
Subject: [PATCH 2/6] Update the perf-changelog

---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2afe61dbe..1f70cc3b2 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3228,3 +3228,11 @@
     - "Picks up the fix for the GSM8K accuracy regression reported in sgl-project/sglang#25742 (v0.5.12-20260517 collapsed to ~0.32 at TP=2)"
     - "Local eval-only runs on MI355X recover to gsm8k strict-match 0.975 at TP=2/conc=64 and 0.974 at TP=4/conc=16, well above the 0.92 upstream gate added in sgl-project/sglang#26396"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1593
+
+- config-keys:
+    - minimaxm2.5-fp8-mi325x-vllm
+  description:
+    - "Pin AITER FA attention backend in benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh to recover the ~38% MI325X throughput regression introduced when vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN (vllm-project/vllm#43029)"
+    - "Export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 to enable the AITER asm/hip paged-attention auto-dispatch"
+    - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594

From b69765b16983668d7a7e5a5ae61663ad12034074 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 14:25:23 +0000
Subject: [PATCH 3/6] runners/launch_mi325x-amds.sh: propagate srun failures

---
 runners/launch_mi325x-amds.sh | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
index 810cbde2f..2e9bc406a 100644
--- a/runners/launch_mi325x-amds.sh
+++ b/runners/launch_mi325x-amds.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+set -euo pipefail
 
 export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
 export PORT=8888
@@ -13,17 +14,20 @@ set -x
 #   chi-mi325x-pod1-121: enroot-aufs2ovlfs setcap fails on this node's NFS-backed
 #                        squash dir; container image import never completes
 #                        (root-caused via #1467/#1468/#1469 sweep failures).
-JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+JOB_ID=$(set +o pipefail; salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
-    echo "ERROR: salloc failed to allocate a job"
+    echo "ERROR: salloc failed to allocate a job" >&2
     exit 1
 fi
 
+trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; exit "$rc"' EXIT
+
 # Use flock to serialize concurrent imports to the same squash file
-srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
+    set -euo pipefail
     exec 9>\"$LOCK_FILE\"
-    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE' >&2; exit 1; }
     if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
         echo 'Squash file already exists and is valid, skipping import'
     else
@@ -31,9 +35,9 @@ srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
         enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
     fi
 "
-srun --jobid=$JOB_ID \
---container-image=$SQUASH_FILE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+srun --jobid="$JOB_ID" \
+--container-image="$SQUASH_FILE" \
+--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
 --container-mount-home \
 --container-writable \
 --container-remap-root \

From fde577731bccaab2cce9470d669eb184cd85100c Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 17:54:52 +0000
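Subject: [PATCH 4/6] minimaxm2.5-fp8-mi325x-vllm: align with upstream
 MiniMax-M2.5 ROCm recipe

- Pin the image to vllm/vllm-openai-rocm:v0.20.2 (was v0.21.0).
- Pass --compilation-config '{"mode":3,"cudagraph_mode":"PIECEWISE"}'
  to vllm serve.
- Record both changes in perf-changelog.yaml.
- runners/launch_mi325x-amds.sh: remove
  $GITHUB_WORKSPACE/benchmark_logs at startup and again from the EXIT
  trap.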
---
 .github/configs/amd-master.yaml                  |  2 +-
 benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh |  1 +
 perf-changelog.yaml                              |  2 ++
 runners/launch_mi325x-amds.sh                    | 10 +++++++++-
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index aba66160b..da9db6ed6 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1038,7 +1038,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 minimaxm2.5-fp8-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.20.2
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi325x
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 22f05a644..4c01bc3d8 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -52,6 +52,7 @@ $EP \
 --block-size=32 \
 --no-enable-prefix-caching \
 --attention-backend ROCM_AITER_FA \
+--compilation-config '{"mode":3,"cudagraph_mode":"PIECEWISE"}' \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1f70cc3b2..c764c0c1a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3235,4 +3235,6 @@
     - "Pin AITER FA attention backend in benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh to recover the ~38% MI325X throughput regression introduced when vLLM PR #36702 (between v0.18.0 and v0.21.0) flipped the dense full-attention default on ROCm from ROCM_AITER_FA to ROCM_ATTN (vllm-project/vllm#43029)"
     - "Export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 to enable the AITER asm/hip paged-attention auto-dispatch"
     - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)"
+    - "Pin image vllm/vllm-openai-rocm:v0.20.2 — the version the upstream recipe explicitly validates (`min_vllm_version: 0.20.2`). v0.21.0 separately crashes during AITER MoE CUDA-graph capture on MiniMax-M2.5 (silent worker death, `Engine core initialization failed`) reproducible via the recipe's exact flags; v0.20.2 + recipe completes a 100-prompt vllm bench serve cleanly at 2030 tok/s total throughput on MI325X (TP=4)"
+    - "Add --compilation-config '{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\"}' to vllm serve, mirroring `model.base_args` from the upstream recipe. `pass_config.fuse_minimax_qk_norm` from the recipe is intentionally omitted — it triggers an upstream NameError on ROCm because vllm/compilation/passes/pass_manager.py imports MiniMaxQKNormPass under `is_cuda()` (NVIDIA-only) while using it unconditionally"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594
diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
index 2e9bc406a..d9b6de862 100644
--- a/runners/launch_mi325x-amds.sh
+++ b/runners/launch_mi325x-amds.sh
@@ -8,6 +8,14 @@ PARTITION="compute"
 SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"
 
+cleanup_stale_benchmark_logs() {
+    if [[ -n "${GITHUB_WORKSPACE:-}" ]]; then
+        sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \
+            rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true
+    fi
+}
+cleanup_stale_benchmark_logs
+
 set -x
 
 # Exclude known-broken mi325x nodes:
@@ -21,7 +29,7 @@ if [ -z "$JOB_ID" ]; then
     exit 1
 fi
 
-trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; exit "$rc"' EXIT
+trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT
 
 # Use flock to serialize concurrent imports to the same squash file
 srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "

From d0fbcf4d19363c5b434694d23d0aabc8cac304de Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 19:15:32 +0000
Subject: [PATCH 5/6] runners/launch_mi325x-amds.sh: derive PORT per job; sudo
 -n in cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use `40000 + (JOB_ID % 10000)` for PORT instead of a hard-coded 8888 —
a non-SLURM Docker workload on chi-mi325x-pod1-019 bound :8888 and
made every sweep job scheduled there fail in sock.bind() with
EADDRINUSE before vLLM ran. Also run the `sudo rm` in
cleanup_stale_benchmark_logs (called at startup and from the EXIT
trap) with `sudo -n`, so that sudo fails immediately instead of
waiting on a password prompt when no tty is attached; the plain
`rm -rf` fallback then runs.
---
 runners/launch_mi325x-amds.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh
index d9b6de862..f44fb46ae 100644
--- a/runners/launch_mi325x-amds.sh
+++ b/runners/launch_mi325x-amds.sh
@@ -2,7 +2,6 @@
 set -euo pipefail
 
 export HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
-export PORT=8888
 
 PARTITION="compute"
 SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -10,7 +9,7 @@ LOCK_FILE="${SQUASH_FILE}.lock"
 
 cleanup_stale_benchmark_logs() {
     if [[ -n "${GITHUB_WORKSPACE:-}" ]]; then
-        sudo rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \
+        sudo -n rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || \
             rm -rf "$GITHUB_WORKSPACE/benchmark_logs" 2>/dev/null || true
     fi
 }
@@ -29,6 +28,8 @@ if [ -z "$JOB_ID" ]; then
     exit 1
 fi
 
+export PORT=$(( 40000 + (JOB_ID % 10000) ))
+
 trap 'rc=$?; scancel "$JOB_ID" 2>/dev/null || true; cleanup_stale_benchmark_logs; exit "$rc"' EXIT
 
 # Use flock to serialize concurrent imports to the same squash file

From 5471c536f4e2ba3ff5a7059b44c8b43257bbf9e7 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Sat, 30 May 2026 20:44:03 +0000
Subject: [PATCH 6/6] minimaxm2.5-fp8-mi325x-vllm: gate SHUFFLE_KV_CACHE_LAYOUT
 per (TP, CONC)

Set VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 (vllm-project/recipes#481,
pillar 2) only at shapes where AITER's gfx942 ASM paged-attention
kernel exists: TP=2 EP=1 with CONC<=16, and TP=8 EP=8 with CONC<=64.
For every other shape the script does not export the variable. Above
those thresholds, pa_fwd_asm fails with
`get_heuristic_kernel: cannot get heuristic kernel!` (gqa=6,
block_size=32, qTile=0) and every request returns HTTP 500. This
mirrors the per-shape toggle in
benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh.

Refs: vllm-project/vllm#43029; thresholds confirmed in sweep run
26692603804.
---
 benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh | 11 ++++++++++-
 perf-changelog.yaml                              |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 4c01bc3d8..657e1b5ae 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -25,7 +25,16 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+
+ENABLE_SHUFFLE_KV_CACHE_LAYOUT=0
+if   [[ "$TP" == "2" && "$EP_SIZE" == "1" ]] && (( CONC <= 16 )); then
+    ENABLE_SHUFFLE_KV_CACHE_LAYOUT=1
+elif [[ "$TP" == "8" && "$EP_SIZE" == "8" ]] && (( CONC <= 64 )); then
+    ENABLE_SHUFFLE_KV_CACHE_LAYOUT=1
+fi
+if (( ENABLE_SHUFFLE_KV_CACHE_LAYOUT )); then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c764c0c1a..ad93a24b4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3237,4 +3237,5 @@
     - "Pass --attention-backend ROCM_AITER_FA to vllm serve, aligning with the merged upstream MiniMax ROCm recipe (vllm-project/recipes#481)"
     - "Pin image vllm/vllm-openai-rocm:v0.20.2 — the version the upstream recipe explicitly validates (`min_vllm_version: 0.20.2`). v0.21.0 separately crashes during AITER MoE CUDA-graph capture on MiniMax-M2.5 (silent worker death, `Engine core initialization failed`) reproducible via the recipe's exact flags; v0.20.2 + recipe completes a 100-prompt vllm bench serve cleanly at 2030 tok/s total throughput on MI325X (TP=4)"
     - "Add --compilation-config '{\"mode\":3,\"cudagraph_mode\":\"PIECEWISE\"}' to vllm serve, mirroring `model.base_args` from the upstream recipe. `pass_config.fuse_minimax_qk_norm` from the recipe is intentionally omitted — it triggers an upstream NameError on ROCm because vllm/compilation/passes/pass_manager.py imports MiniMaxQKNormPass under `is_cuda()` (NVIDIA-only) while using it unconditionally"
+    - "Conditionally enable VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 per (TP, EP, CONC) — on for shapes where the AITER ASM paged-attention kernel exists in the gfx942 heuristic table (TP=2 EP=1 CONC<=16, TP=8 EP=8 CONC<=64), off otherwise. Above the thresholds vllm/v1/attention/backends/rocm_aiter_fa.py routes decode through aiter pa_fwd_asm and crashes with `RuntimeError: get_heuristic_kernel: cannot get heuristic kernel!` for MiniMax-M2.5's attention shape (gqa=6 block_size=32 qTile=0); below them the ASM auto-dispatch is the perf win the recipe wants. Thresholds confirmed across 17 bench cells + 3 eval cells in PR #1594 sweep run 26692603804. Mirrors the per-shape toggle pattern in benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh; can collapse to unconditional SHUFFLE=1 once AITER registers the missing kernel on gfx942"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1594