diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index fadfe22..d6b6475 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -4888,75 +4888,34 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #define TEMPORAL_STORE 2 #define TEMPORAL_BOTH 3 - template - __device__ __forceinline__ void Load(float const* src, float& dst) { - if (TEMPORAL_MODE & TEMPORAL_LOAD) { -#if !defined(__NVCC__) - dst = __builtin_nontemporal_load(src); - -#endif - } else { - dst = *src; - } - } - - template - __device__ __forceinline__ void Load(float2 const* src, float2& dst) { - if (TEMPORAL_MODE & TEMPORAL_LOAD) { -#if !defined(__NVCC__) - dst.x = __builtin_nontemporal_load(&(src->x)); - dst.y = __builtin_nontemporal_load(&(src->y)); -#endif - } else { - dst = *src; - } - } + template + struct VecCast { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "NonTemporalVec: T must be float, float2, or float4"); + using type = float __attribute__((ext_vector_type(sizeof(T) / sizeof(float)))); + }; + template + using VecCastType = typename VecCast::type; - template - __device__ __forceinline__ void Load(float4 const* src, float4& dst) { - if (TEMPORAL_MODE & TEMPORAL_LOAD) { + template + __device__ __forceinline__ void Load(T const* src, T& dst) { + if constexpr (TEMPORAL_MODE & TEMPORAL_LOAD) { #if !defined(__NVCC__) - dst.x = __builtin_nontemporal_load(&(src->x)); - dst.y = __builtin_nontemporal_load(&(src->y)); - dst.z = __builtin_nontemporal_load(&(src->z)); - dst.w = __builtin_nontemporal_load(&(src->w)); + using Vec = VecCastType; + *reinterpret_cast(&dst) = __builtin_nontemporal_load(reinterpret_cast(src)); #endif } else { dst = *src; } } - template - __device__ __forceinline__ void Store(float const& src, float* dst) { - if (TEMPORAL_MODE & TEMPORAL_STORE) { + template + __device__ __forceinline__ void Store(T const& src, T* dst) { + if constexpr (TEMPORAL_MODE & TEMPORAL_STORE) { #if !defined(__NVCC__) - __builtin_nontemporal_store(src, dst); -#endif - } else { - *dst = src; - } - } - - template - __device__ __forceinline__ void Store(float2 const& src, float2* dst) { - if (TEMPORAL_MODE & TEMPORAL_STORE) { -#if !defined(__NVCC__) - __builtin_nontemporal_store(src.x, &(dst->x)); - __builtin_nontemporal_store(src.y, &(dst->y)); -#endif - } else { - *dst = src; - } - } - - template - __device__ __forceinline__ void Store(float4 const& src, float4* dst) { - if (TEMPORAL_MODE & TEMPORAL_STORE) { -#if !defined(__NVCC__) - __builtin_nontemporal_store(src.x, &(dst->x)); - __builtin_nontemporal_store(src.y, &(dst->y)); - __builtin_nontemporal_store(src.z, &(dst->z)); - __builtin_nontemporal_store(src.w, &(dst->w)); + using Vec = VecCastType; + __builtin_nontemporal_store(*reinterpret_cast(&src), reinterpret_cast(dst)); #endif } else { *dst = src; @@ -4964,17 +4923,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } //---------------------------------------------------------------------------- -#if defined(__NVCC__) - __global__ void GpuAsyncTensorOpsStubKernel(float const* __restrict__ src, - float* __restrict__ dst, - size_t numFloats) - { +#if !defined(__gfx1250__) + __global__ void NonAsyncFallbackKernel(float const* __restrict__ src, float* __restrict__ dst, size_t numFloats){ size_t const gid = blockIdx.x * blockDim.x + threadIdx.x; size_t const stride = gridDim.x * blockDim.x; - for (size_t i = gid; i < numFloats; i += stride) + for (size_t i = gid; i < numFloats; i += stride){ dst[i] = src[i]; + } } -#else +#else // __gfx1250__ // The TDM API does not have a function to set the transfer size, so we need to do it manually. __device__ void SetTransferSize(gfx1250_TDM_GROUP1& group1, int numElements){ group1.tensorDim0(numElements); @@ -5100,22 +5057,32 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src, tileMovers[slot].StoreTile(); __builtin_amdgcn_s_wait_tensorcnt(0); } -#endif // Source for definitions: https://github.com/llvm/llvm-project/blob/4e3bac3ea2cc6fd778d53e317dd9fc27c1ddfc4f/clang/include/clang/Basic/BuiltinsAMDGPU.td#L956 // and internal ISA documentation - using uint32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; + using int32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int; // Loads 16 bytes/lane from global (src) into LDS (dst). __device__ void asyncLoadX4(float const* src, float* dst){ __builtin_amdgcn_global_load_async_to_lds_b128( - (__attribute__((address_space(1))) uint32x4*)src, - (__attribute__((address_space(3))) uint32x4*)dst, 0, 0); + (__attribute__((address_space(1))) int32x4*)src, + (__attribute__((address_space(3))) int32x4*)dst, 0, 0); } // Stores 16 bytes/lane from LDS (src) to global (dst). __device__ void asyncStoreX4(float const* src, float* dst){ __builtin_amdgcn_global_store_async_from_lds_b128( - (__attribute__((address_space(1))) uint32x4*)dst, - (__attribute__((address_space(3))) uint32x4*)src, 0, 0); + (__attribute__((address_space(1))) int32x4*)dst, + (__attribute__((address_space(3))) int32x4*)src, 0, 0); + } + + __device__ void asyncLoad(float const* src, float* dst){ + __builtin_amdgcn_global_load_async_to_lds_b32( + (__attribute__((address_space(1))) int32_t*)src, + (__attribute__((address_space(3))) int32_t*)dst, 0, 0); + } + __device__ void asyncStore(float const* src, float* dst){ + __builtin_amdgcn_global_store_async_from_lds_b32( + (__attribute__((address_space(1))) int32_t*)dst, + (__attribute__((address_space(3))) int32_t*)src, 0, 0); } // numElementsPerTile should be a multiple of 128, given the asyncLoadX4 and asyncStoreX4 functions operate on a 32-lane warp and 4 elements per lane (512 bytes total) @@ -5158,7 +5125,7 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src, tileBase += itemsProcessedPerGridIteration; } } - +#endif // __gfx1250__ // Simplified Kernel for GFX execution for copies only template @@ -5763,8 +5730,8 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src, do { if (startEvent) ERR_CHECK(hipEventRecord(startEvent, stream)); -#if defined(__NVCC__) - GpuAsyncTensorOpsStubKernel<<>>(src, dst, numFloats); +#if !defined(__gfx1250__) // NVIDIA and AMD Instinct GPUs prior to GFX1250 + NonAsyncFallbackKernel<<>>(src, dst, numFloats); #else bool usePipelinedTensorOps = (cfg.tdm.pipelined != 0); auto gpuKernel = tensorFlavor ? (usePipelinedTensorOps ? GpuAsyncPipelinedTensorOpsKernel : GpuAsyncTensorOpsKernel) : GpuAsyncLoadStoreKernel;