Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 43 additions & 76 deletions src/header/TransferBench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4888,93 +4888,50 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);

#endif
} else {
dst = *src;
}
}

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template <typename T>
struct VecCast {
static_assert(
std::is_same_v<T, float> || std::is_same_v<T, float2> || std::is_same_v<T, float4>,
"NonTemporalVec: T must be float, float2, or float4");
using type = float __attribute__((ext_vector_type(sizeof(T) / sizeof(float))));
};
template <typename T>
using VecCastType = typename VecCast<T>::type;

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst) {
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
template <int TEMPORAL_MODE, typename T>
__device__ __forceinline__ void Load(T const* src, T& dst) {
if constexpr (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
using Vec = VecCastType<T>;
*reinterpret_cast<Vec*>(&dst) = __builtin_nontemporal_load(reinterpret_cast<Vec const*>(src));
#endif
} else {
dst = *src;
}
}

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
template <int TEMPORAL_MODE, typename T>
__device__ __forceinline__ void Store(T const& src, T* dst) {
if constexpr (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}

template <int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst) {
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
using Vec = VecCastType<T>;
__builtin_nontemporal_store(*reinterpret_cast<Vec const*>(&src), reinterpret_cast<Vec*>(dst));
#endif
} else {
*dst = src;
}
}

//----------------------------------------------------------------------------
#if defined(__NVCC__)
__global__ void GpuAsyncTensorOpsStubKernel(float const* __restrict__ src,
float* __restrict__ dst,
size_t numFloats)
{
#if !defined(__gfx1250__)
__global__ void NonAsyncFallbackKernel(float const* __restrict__ src, float* __restrict__ dst, size_t numFloats){
size_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
size_t const stride = gridDim.x * blockDim.x;
for (size_t i = gid; i < numFloats; i += stride)
for (size_t i = gid; i < numFloats; i += stride){
dst[i] = src[i];
}
}
#else
#else // __gfx1250__
// The TDM API does not have a function to set the transfer size, so we need to do it manually.
__device__ void SetTransferSize(gfx1250_TDM_GROUP1& group1, int numElements){
group1.tensorDim0(numElements);
Expand Down Expand Up @@ -5100,22 +5057,32 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
tileMovers[slot].StoreTile();
__builtin_amdgcn_s_wait_tensorcnt(0);
}
#endif

// Source for definitions: https://github.com/llvm/llvm-project/blob/4e3bac3ea2cc6fd778d53e317dd9fc27c1ddfc4f/clang/include/clang/Basic/BuiltinsAMDGPU.td#L956
// and internal ISA documentation
using uint32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
using int32x4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
// Loads 16 bytes/lane from global (src) into LDS (dst).
__device__ void asyncLoadX4(float const* src, float* dst){
__builtin_amdgcn_global_load_async_to_lds_b128(
(__attribute__((address_space(1))) uint32x4*)src,
(__attribute__((address_space(3))) uint32x4*)dst, 0, 0);
(__attribute__((address_space(1))) int32x4*)src,
(__attribute__((address_space(3))) int32x4*)dst, 0, 0);
}
// Stores 16 bytes/lane from LDS (src) to global (dst).
__device__ void asyncStoreX4(float const* src, float* dst){
__builtin_amdgcn_global_store_async_from_lds_b128(
(__attribute__((address_space(1))) uint32x4*)dst,
(__attribute__((address_space(3))) uint32x4*)src, 0, 0);
(__attribute__((address_space(1))) int32x4*)dst,
(__attribute__((address_space(3))) int32x4*)src, 0, 0);
}

__device__ void asyncLoad(float const* src, float* dst){
__builtin_amdgcn_global_load_async_to_lds_b32(
(__attribute__((address_space(1))) int32_t*)src,
(__attribute__((address_space(3))) int32_t*)dst, 0, 0);
}
__device__ void asyncStore(float const* src, float* dst){
__builtin_amdgcn_global_store_async_from_lds_b32(
(__attribute__((address_space(1))) int32_t*)dst,
(__attribute__((address_space(3))) int32_t*)src, 0, 0);
}

// numElementsPerTile should be a multiple of 128, given the asyncLoadX4 and asyncStoreX4 functions operate on a 32-lane warp and 4 elements per lane (512 bytes total)
Expand Down Expand Up @@ -5158,7 +5125,7 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
tileBase += itemsProcessedPerGridIteration;
}
}

#endif // __gfx1250__

// Simplified Kernel for GFX execution for copies only
template <typename PACKED_FLOAT, int LAUNCH_BOUND, int UNROLL, int TEMPORAL_MODE>
Expand Down Expand Up @@ -5763,8 +5730,8 @@ __global__ void GpuAsyncPipelinedTensorOpsKernel(float const* __restrict__ src,
do {
if (startEvent)
ERR_CHECK(hipEventRecord(startEvent, stream));
#if defined(__NVCC__)
GpuAsyncTensorOpsStubKernel<<<dim3(blocks), dim3(threads), 0, stream>>>(src, dst, numFloats);
#if !defined(__gfx1250__) // NVIDIA and AMD Instinct GPUs prior to GFX1250
NonAsyncFallbackKernel<<<dim3(blocks), dim3(threads), 0, stream>>>(src, dst, numFloats);
#else
bool usePipelinedTensorOps = (cfg.tdm.pipelined != 0);
auto gpuKernel = tensorFlavor ? (usePipelinedTensorOps ? GpuAsyncPipelinedTensorOpsKernel : GpuAsyncTensorOpsKernel) : GpuAsyncLoadStoreKernel;
Expand Down