ROCm · alex-breslow-amd · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
@@ -4888,93 +4888,50 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   #define TEMPORAL_STORE 2
   #define TEMPORAL_BOTH  3
 
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float const* src, float& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
-#if !defined(__NVCC__)
-      dst = __builtin_nontemporal_load(src);
-
-#endif
-    } else {
-      dst = *src;
-    }
-  }
-
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float2 const* src, float2& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
-#if !defined(__NVCC__)
-      dst.x = __builtin_nontemporal_load(&(src->x));
-      dst.y = __builtin_nontemporal_load(&(src->y));
-#endif
-    } else {
-      dst = *src;
-    }
-  }
+  template <typename T>
+  struct VecCast {
+    static_assert(
+      std::is_same_v<T, float> || std::is_same_v<T, float2> || std::is_same_v<T, float4>,
+      "NonTemporalVec: T must be float, float2, or float4");
+    using type = float __attribute__((ext_vector_type(sizeof(T) / sizeof(float))));
+  };
+  template <typename T>
+  using VecCastType = typename VecCast<T>::type;
 
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Load(float4 const* src, float4& dst) {
-    if (TEMPORAL_MODE & TEMPORAL_LOAD) {
+  template <int TEMPORAL_MODE, typename T>
+  __device__ __forceinline__ void Load(T const* src, T& dst) {
+    if constexpr (TEMPORAL_MODE & TEMPORAL_LOAD) {
 #if !defined(__NVCC__)
-      dst.x = __builtin_nontemporal_load(&(src->x));
-      dst.y = __builtin_nontemporal_load(&(src->y));
-      dst.z = __builtin_nontemporal_load(&(src->z));
-      dst.w = __builtin_nontemporal_load(&(src->w));
+      using Vec = VecCastType<T>;
+      *reinterpret_cast<Vec*>(&dst) = __builtin_nontemporal_load(reinterpret_cast<Vec const*>(src));
 #endif
     } else {
       dst = *src;
     }
   }
 
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float const& src, float* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
+  template <int TEMPORAL_MODE, typename T>
+  __device__ __forceinline__ void Store(T const& src, T* dst) {
+    if constexpr (TEMPORAL_MODE & TEMPORAL_STORE) {
 #if !defined(__NVCC__)
-      __builtin_nontemporal_store(src, dst);
-#endif
-    } else {
-      *dst = src;
-    }
-  }
-
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float2 const& src, float2* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
-#if !defined(__NVCC__)
-      __builtin_nontemporal_store(src.x, &(dst->x));
-      __builtin_nontemporal_store(src.y, &(dst->y));
-#endif
-    } else {
-      *dst = src;
-    }
-  }
-
-  template <int TEMPORAL_MODE>
-  __device__ __forceinline__ void Store(float4 const& src, float4* dst) {
-    if (TEMPORAL_MODE & TEMPORAL_STORE) {
-#if !defined(__NVCC__)
-      __builtin_nontemporal_store(src.x, &(dst->x));
-      __builtin_nontemporal_store(src.y, &(dst->y));
-      __builtin_nontemporal_store(src.z, &(dst->z));
-      __builtin_nontemporal_store(src.w, &(dst->w));
+      using Vec = VecCastType<T>;
+      __builtin_nontemporal_store(*reinterpret_cast<Vec const*>(&src), reinterpret_cast<Vec*>(dst));
 #endif
     } else {
       *dst = src;
     }
   }
 
   //----------------------------------------------------------------------------
-#if defined(__NVCC__)
-  __global__ void GpuAsyncTensorOpsStubKernel(float const* __restrict__ src,
-                                               float* __restrict__ dst,
-                                               size_t numFloats)
-  {
+#if !defined(__gfx1250__)
+  __global__ void NonAsyncFallbackKernel(float const* __restrict__ src, float* __restrict__ dst, size_t numFloats){
     size_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
     size_t const stride = gridDim.x * blockDim.x;
-    for (size_t i = gid; i < numFloats; i += stride)
+    for (size_t i = gid; i < numFloats; i += stride){
       dst[i] = src[i];
+    }
   }
-#else
+#else // __gfx1250__
 // The TDM API does not have a function to set the transfer size, so we need to do it manually.
 __device__ void SetTransferSize(gfx1250_TDM_GROUP1& group1, int numElements){
   group1.tensorDim0(numElements);
@@ -5100,22 +5057,32 @@ __global__ void  GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
   tileMovers[slot].StoreTile();
   __builtin_amdgcn_s_wait_tensorcnt(0);
 }
-#endif
 
   // Source for definitions: https://github.com/llvm/llvm-project/blob/4e3bac3ea2cc6fd778d53e317dd9fc27c1ddfc4f/clang/include/clang/Basic/BuiltinsAMDGPU.td#L956
   // and internal ISA documentation
-  using uint32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+  using int32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
   // Loads 16 bytes/lane from global (src) into LDS (dst).
   __device__ void asyncLoadX4(float const* src, float* dst){
     __builtin_amdgcn_global_load_async_to_lds_b128(
-        (__attribute__((address_space(1))) uint32x4*)src,
-        (__attribute__((address_space(3))) uint32x4*)dst, 0, 0);
+        (__attribute__((address_space(1))) int32x4*)src,
+        (__attribute__((address_space(3))) int32x4*)dst, 0, 0);
   }
   // Stores 16 bytes/lane from LDS (src) to global (dst).
   __device__ void asyncStoreX4(float const* src, float* dst){
     __builtin_amdgcn_global_store_async_from_lds_b128(
-        (__attribute__((address_space(1))) uint32x4*)dst,
-        (__attribute__((address_space(3))) uint32x4*)src, 0, 0);
+        (__attribute__((address_space(1))) int32x4*)dst,
+        (__attribute__((address_space(3))) int32x4*)src, 0, 0);
+  }
+
+  __device__ void asyncLoad(float const* src, float* dst){
+    __builtin_amdgcn_global_load_async_to_lds_b32(
+        (__attribute__((address_space(1))) int32_t*)src,
+        (__attribute__((address_space(3))) int32_t*)dst, 0, 0);
+  }
+  __device__ void asyncStore(float const* src, float* dst){
+    __builtin_amdgcn_global_store_async_from_lds_b32(
+        (__attribute__((address_space(1))) int32_t*)dst,
+        (__attribute__((address_space(3))) int32_t*)src, 0, 0);
   }
 
   // numElementsPerTile should be a multiple of 128, given the asyncLoadX4 and asyncStoreX4 functions operate on a 32-lane warp and 4 elements per lane (512 bytes total)
@@ -5158,7 +5125,7 @@ __global__ void  GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
     tileBase += itemsProcessedPerGridIteration;
   }
 }
-
+#endif // __gfx1250__
 
   // Simplified Kernel for GFX execution for copies only
   template <typename PACKED_FLOAT, int LAUNCH_BOUND, int UNROLL, int TEMPORAL_MODE>
@@ -5763,8 +5730,8 @@ __global__ void  GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
     do {
       if (startEvent)
         ERR_CHECK(hipEventRecord(startEvent, stream));
-#if defined(__NVCC__)
-      GpuAsyncTensorOpsStubKernel<<<dim3(blocks), dim3(threads), 0, stream>>>(src, dst, numFloats);
+#if !defined(__gfx1250__) // NVIDIA and AMD Instinct GPUs prior to GFX1250
+      NonAsyncFallbackKernel<<<dim3(blocks), dim3(threads), 0, stream>>>(src, dst, numFloats);
 #else
       bool usePipelinedTensorOps = (cfg.tdm.pipelined != 0);
       auto gpuKernel = tensorFlavor ? (usePipelinedTensorOps ? GpuAsyncPipelinedTensorOpsKernel : GpuAsyncTensorOpsKernel) : GpuAsyncLoadStoreKernel;