From dccae1050f01d543580ffe40e5e62badfce5b0be Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 9 Jun 2026 21:10:32 +0000 Subject: [PATCH 1/4] nic_exe/ibverbs/dmabuf refactor --- CMakeLists.txt | 87 +---- Makefile | 56 +-- build_packages_local.sh | 2 - src/header/TransferBench.hpp | 351 +++++++++---------- third-party/ibverbs/IbvDynLoad.hpp | 210 +++++++++++ third-party/ibverbs/IbvHeader.hpp | 544 +++++++++++++++++++++++++++++ 6 files changed, 942 insertions(+), 308 deletions(-) create mode 100644 third-party/ibverbs/IbvDynLoad.hpp create mode 100644 third-party/ibverbs/IbvHeader.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index af9197e5..704ae42e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,9 +199,7 @@ include(CMakePushCheckState) # Build options #================================================================================================== option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) -option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF) option(ENABLE_MPI_COMM "Enable MPI Communicator support" OFF) -option(ENABLE_DMA_BUF "Enable DMA-BUF support for GPU Direct RDMA" OFF) option(ENABLE_AMD_SMI "Enable AMD-SMI pod membership queries" OFF) option(ENABLE_POD_COMM "Enable pod communication" OFF) option(BUILD_RELOCATABLE_PACKAGE "Build with RVS-style relocatable RPATH and amdrocm-transferbench package naming" OFF) @@ -313,71 +311,13 @@ else() message(FATAL_ERROR "HSA library or headers not found under ${ROCM_PATH}; TransferBench requires libhsa-runtime64") endif() -## Check for infiniband verbs support -if(DEFINED ENV{DISABLE_NIC_EXEC} AND "$ENV{DISABLE_NIC_EXEC}" STREQUAL "1") - message(STATUS "Disabling NIC Executor support as env. 
flag DISABLE_NIC_EXEC was enabled") -elseif(NOT ENABLE_NIC_EXEC) - message(STATUS "For CMake builds, NIC Executor support requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=ON") - message(STATUS "- Disabling NIC Executor support") -else() - message(STATUS "Attempting to build with NIC executor support") - - find_library(IBVERBS_LIBRARY ibverbs) - find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h) - if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR) - add_library(ibverbs SHARED IMPORTED) - set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}") - set(IBVERBS_FOUND 1) - message(STATUS "- Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") - else() - if(NOT IBVERBS_LIBRARY) - message(WARNING "- IBVerbs library not found") - elseif(NOT IBVERBS_INCLUDE_DIR) - message(WARNING "- infiniband/verbs.h not found") - endif() - message(WARNING "- Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") - endif() -endif() - -## Check for DMA-BUF support (requires IBVERBS) -if(IBVERBS_FOUND) - if(DEFINED ENV{DISABLE_DMA_BUF} AND "$ENV{DISABLE_DMA_BUF}" STREQUAL "1") - message(STATUS "Disabling DMA-BUF support as env. 
flag DISABLE_DMA_BUF was enabled") - elseif(NOT ENABLE_DMA_BUF) - message(STATUS "For CMake builds, DMA-BUF support requires explicit opt-in by setting CMake flags -DENABLE_DMA_BUF=ON") - message(STATUS "- Disabling DMA-BUF support") - else() - message(STATUS "Attempting to build with DMA-BUF support") - - # Check for ibv_reg_dmabuf_mr - cmake_push_check_state() - set(CMAKE_REQUIRED_INCLUDES ${IBVERBS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${IBVERBS_LIBRARY}) - check_symbol_exists(ibv_reg_dmabuf_mr "infiniband/verbs.h" HAVE_IBV_DMABUF) - cmake_pop_check_state() - - # Check for hsa_amd_portable_export_dmabuf - cmake_push_check_state() - set(CMAKE_REQUIRED_INCLUDES ${HSA_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${HSA_LIBRARY}) - check_symbol_exists(hsa_amd_portable_export_dmabuf "hsa/hsa_ext_amd.h" HAVE_ROCM_DMABUF) - cmake_pop_check_state() - - # Enable DMA-BUF only if both APIs are available - if(HAVE_IBV_DMABUF AND HAVE_ROCM_DMABUF) - set(DMABUF_SUPPORT_FOUND 1) - message(STATUS "- Building with DMA-BUF support") - else() - if(NOT HAVE_IBV_DMABUF AND NOT HAVE_ROCM_DMABUF) - message(WARNING "- Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export") - elseif(NOT HAVE_IBV_DMABUF) - message(WARNING "- Building without DMA-BUF support: missing ibv_reg_dmabuf_mr") - else() - message(WARNING "- Building without DMA-BUF support: missing ROCm DMA-BUF export") - endif() - endif() - endif() -endif() +## NIC / RDMA executor +## +## libibverbs is loaded dynamically at runtime via dlopen/dlsym +## (see vendors/ibverbs/IbvDynLoad.hpp). The build neither links against +## -libverbs nor requires libibverbs-dev to be installed on the build host. +## Only libdl needs to be linked so that dlopen/dlsym resolve. 
+message(STATUS "NIC executor: libibverbs is loaded dynamically at runtime (no -libverbs link, no build-host dependency)") ## Check for MPI support set(MPI_PATH "" CACHE PATH "Path to MPI installation (takes priority over system MPI)") @@ -510,13 +450,11 @@ add_executable(TransferBench src/client/Client.cpp) target_include_directories(TransferBench PRIVATE src/header src/client - src/client/Presets) + src/client/Presets + vendors/ibverbs) -if(IBVERBS_FOUND) - target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR}) - target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY}) - target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED) -endif() +# libdl supplies dlopen/dlsym used by vendors/ibverbs/IbvDynLoad.hpp. +target_link_libraries(TransferBench PRIVATE ${CMAKE_DL_LIBS}) if(MPI_COMM_FOUND) if(TARGET MPI::MPI_CXX) target_link_libraries(TransferBench PRIVATE MPI::MPI_CXX) @@ -526,9 +464,6 @@ if(MPI_COMM_FOUND) endif() target_compile_definitions(TransferBench PRIVATE MPI_COMM_ENABLED) endif() -if(DMABUF_SUPPORT_FOUND) - target_compile_definitions(TransferBench PRIVATE HAVE_DMABUF_SUPPORT) -endif() if(AMD_SMI_FOUND) target_include_directories(TransferBench PRIVATE ${AMD_SMI_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${AMD_SMI_LIBRARY}) diff --git a/Makefile b/Makefile index db40ed20..ea0ce643 100644 --- a/Makefile +++ b/Makefile @@ -11,9 +11,7 @@ NVCC ?= $(CUDA_PATH)/bin/nvcc DEBUG ?= 0 # Optional features (set to 0 to disable, 1 to enable) -# DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) # DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) -# DISABLE_DMA_BUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) # DISABLE_AMD_SMI: Disable AMD-SMI pod membership checking support (default: 0) # DISABLE_NVML: Disable NVML pod membership detection for CUDA builds (default: 0) # DISABLE_POD_COMM: Disable pod communication support (default: 0) @@ -83,55 +81,13 @@ ifeq ($(filter 
clean,$(MAKECMDGOALS)),) else COMMON_FLAGS += -O0 -g -ggdb3 endif - COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets + COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets -I./vendors/ibverbs - LDFLAGS += -lpthread - - NIC_ENABLED = 0 - # Compile RDMA executor if - # 1) DISABLE_NIC_EXEC is not set to 1 - # 2) IBVerbs is found in the Dynamic Linker cache - # 3) infiniband/verbs.h is found in the default include path - DISABLE_NIC_EXEC ?= 0 - ifneq ($(DISABLE_NIC_EXEC),1) - $(info Attempting to build with NIC executor support) - ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0") - $(info - ibverbs library not found) - else ifeq ("$(shell echo '#include ' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0") - $(info - infiniband/verbs.h not found) - else - COMMON_FLAGS += -DNIC_EXEC_ENABLED - LDFLAGS += -libverbs - NIC_ENABLED = 1 - - # Disable DMA-BUF support by default (set DISABLE_DMA_BUF=0 to enable) - DISABLE_DMA_BUF ?= 1 - ifeq ($(DISABLE_DMA_BUF), 0) - # Check for both ibv_reg_dmabuf_mr and ROCm DMA-BUF export support - HAVE_IBV_DMABUF := $(shell echo '#include ' | $(CXX) -E - 2>/dev/null | grep -c 'ibv_reg_dmabuf_mr') - HAVE_ROCM_DMABUF := $(shell echo '#include ' | $(CXX) -I$(ROCM_PATH)/include -E - 2>/dev/null | grep -c 'hsa_amd_portable_export_dmabuf') - - ifeq ($(HAVE_IBV_DMABUF):$(HAVE_ROCM_DMABUF), 0:0) - $(info Building without DMA-BUF support: missing both ibv_reg_dmabuf_mr and ROCm DMA-BUF export) - else ifeq ($(HAVE_IBV_DMABUF), 0) - $(info Building without DMA-BUF support: missing ibv_reg_dmabuf_mr) - else ifeq ($(HAVE_ROCM_DMABUF), 0) - $(info Building without DMA-BUF support: missing ROCm DMA-BUF export) - else - COMMON_FLAGS += -DHAVE_DMABUF_SUPPORT - $(info Building with DMA-BUF support) - endif - else - $(info Building with DMA-BUF support disabled (DISABLE_DMA_BUF=1)) - endif - endif - ifeq ($(NIC_ENABLED), 0) - $(info - Building without NIC executor support) - $(info - To use the TransferBench 
RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed) - else - $(info - Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable) - endif - endif + # libibverbs is loaded dynamically at runtime via dlopen/dlsym (see + # vendors/ibverbs/IbvDynLoad.hpp), so the build never links against -libverbs + # and does not require libibverbs-dev to be installed. We only need -ldl so + # the dynamic loader API is resolvable. + LDFLAGS += -lpthread -ldl MPI_ENABLED = 0 # Compile with MPI communicator support if diff --git a/build_packages_local.sh b/build_packages_local.sh index 91ace56b..e2c073d1 100755 --- a/build_packages_local.sh +++ b/build_packages_local.sh @@ -229,9 +229,7 @@ CMAKE_ARGS=( -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_RELOCATABLE_PACKAGE=ON -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF - -DENABLE_NIC_EXEC=OFF -DENABLE_MPI_COMM=OFF - -DDISABLE_DMABUF=OFF -DGPU_TARGETS="${GPU_TARGETS}" -DTRANSFERBENCH_PACKAGE_RELEASE="${PKG_RELEASE}" ) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index fa536155..cd39f892 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -55,9 +55,7 @@ THE SOFTWARE. 
#include #include -#ifdef NIC_EXEC_ENABLED -#include -#endif +#include "IbvDynLoad.hpp" #ifdef MPI_COMM_ENABLED #include @@ -1669,16 +1667,34 @@ namespace { return ERR_NONE; } -#if defined(NIC_EXEC_ENABLED) && defined(HAVE_DMABUF_SUPPORT) && !defined(__NVCC__) +#if defined(__NVCC__) + static bool CheckDmabufSupport() + { + return false; + } + static ErrResult ExportDmabuf(void* gpuPtr, size_t numBytes, int& dmabufFd, uint64_t& dmabufOffset) + { + return {ERR_FATAL, "DMA-BUF export not yet supported on NVIDIA platform"}; + } +#else + hsa_status_t (*pfn_hsa_amd_portable_export_dmabuf)(const void*, size_t, int*, uint64_t*); // Check kernel configuration for required DMA-BUF support // Returns true if kernel supports CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA - static bool CheckKernelDmabufSupport() + static bool CheckDmabufSupport() { static int support = -1; // -1: not checked, 0: disabled, 1: enabled if (support != -1) { return support; } + // Verify that the hsa_amd_portable_export_dmabuf and ibv_reg_dmabuf_mr symbols are available. + // ROCr and the hsa_ext_amd header are always mandatory, so their presence need not be checked. + pfn_hsa_amd_portable_export_dmabuf = + (hsa_status_t (*)(const void*, size_t, int*, uint64_t*))dlsym(RTLD_DEFAULT, "hsa_amd_portable_export_dmabuf"); + if (pfn_hsa_amd_portable_export_dmabuf == nullptr || !TbIbvDmabufPresent()) { + support = 0; + return support; + } struct utsname utsname; FILE* fp = NULL; @@ -1802,7 +1818,7 @@ namespace { // Export the aligned GPU buffer as DMA-BUF uint64_t exportOffset = 0; - hsa_status_t status = hsa_amd_portable_export_dmabuf(alignedPtr, alignedSize, &dmabufFd, &exportOffset); + hsa_status_t status = pfn_hsa_amd_portable_export_dmabuf(alignedPtr, alignedSize, &dmabufFd, &exportOffset); if (status != HSA_STATUS_SUCCESS) { return {ERR_FATAL, "Failed to export DMA-BUF: hsa_amd_portable_export_dmabuf returned %d", status}; @@ -2110,14 +2126,14 @@ namespace { } // Check NIC options -#ifdef NIC_EXEC_ENABLED - if 
(cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) { - errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a non-negative multiple of 4"}); - } - if (cfg.nic.cqPollBatch <= 0) { - errors.push_back({ERR_FATAL, "[nic.cqPollBatch] must be positive"}); + if (TbIbvSymbolsReady()) { + if (cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) { + errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a non-negative multiple of 4"}); + } + if (cfg.nic.cqPollBatch <= 0) { + errors.push_back({ERR_FATAL, "[nic.cqPollBatch] must be positive"}); + } } -#endif // NVIDIA specific #if defined(__NVCC__) @@ -2488,7 +2504,7 @@ namespace { break; #endif case EXE_NIC: case EXE_NIC_NEAREST: -#ifdef NIC_EXEC_ENABLED + if (TbIbvSymbolsReady()) { // NIC Executors can only execute a copy operation if (t.srcs.size() != 1 || t.dsts.size() != 1) { @@ -2542,11 +2558,10 @@ namespace { hasFatalError = true; break; } + } else { + errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available.", i}); + hasFatalError = true; } -#else - errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available.", i}); - hasFatalError = true; -#endif break; } @@ -2767,7 +2782,6 @@ namespace { #endif // For IBV executor -#ifdef NIC_EXEC_ENABLED int srcNicIndex; ///< SRC NIC index int dstNicIndex; ///< DST NIC index ibv_context* srcContext; ///< Device context for SRC NIC @@ -2792,7 +2806,6 @@ namespace { bool srcIsExeNic; ///< Whether SRC or DST NIC initiates traffic vector> sgePerQueuePair; ///< Scatter-gather elements per queue pair vector>sendWorkRequests; ///< Send work requests per queue pair -#endif // For BMA executor #ifdef BMA_EXEC_ENABLED @@ -2850,7 +2863,6 @@ namespace { } }; -#ifdef NIC_EXEC_ENABLED // Structure to track information about IBV devices struct IbvDevice { @@ -2863,12 +2875,11 @@ namespace { std::string gidDescriptor; bool isRoce; }; -#endif -#ifdef NIC_EXEC_ENABLED // Function to collect information about IBV devices 
//======================================================================================== -static bool IsConfiguredGid(union ibv_gid const& gid) + + static bool IsConfiguredGid(union ibv_gid const& gid) { const struct in6_addr *a = (struct in6_addr *) gid.raw; int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]); @@ -2981,7 +2992,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) static vector ibvDeviceList = {}; // Build list on first use - if (!isInitialized) { + if (TbIbvSymbolsReady() && !isInitialized) { // Query the number of IBV devices int numIbvDevices = 0; @@ -3066,9 +3077,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } return ibvDeviceList; } -#endif // NIC_EXEC_ENABLED -#ifdef NIC_EXEC_ENABLED // PCIe-related functions //======================================================================================== @@ -3253,9 +3262,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } return matches; } -#endif // NIC_EXEC_ENABLED -#ifdef NIC_EXEC_ENABLED // IB Verbs-related functions //======================================================================================== @@ -3426,16 +3433,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (!dmabufStatusPrinted) { dmabufStatusPrinted = true; printf("[INFO] Rank %d DMA-BUF support: ", GetRank()); -#if defined(HAVE_DMABUF_SUPPORT) && !defined(__NVCC__) - bool kernelSupport = CheckKernelDmabufSupport(); + bool kernelSupport = CheckDmabufSupport(); if (kernelSupport) { printf("ENABLED\n"); } else { - printf("DISABLED (kernel config missing, using standard ibv_reg_mr)\n"); + printf("DISABLED (kernel config or export symbol missing, using standard ibv_reg_mr)\n"); } -#else - printf("DISABLED (using standard ibv_reg_mr)\n"); -#endif } } @@ -3464,27 +3467,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext); // Export DMA-BUF for SRC memory if it's GPU memory -#if defined(HAVE_DMABUF_SUPPORT) && 
!defined(__NVCC__) - if (!t.srcs.empty() && IsGpuMemType(t.srcs[0].memType) && CheckKernelDmabufSupport()) { + if (CheckDmabufSupport() && !t.srcs.empty() && IsGpuMemType(t.srcs[0].memType)) { ERR_CHECK(ExportDmabuf(rss.srcMem[0], rss.numBytes, rss.srcDmabufFd, rss.srcDmabufOffset)); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d exported SRC GPU memory as DMA-BUF (fd=%d, offset=%lu)\n", GetRank(), rss.srcDmabufFd, rss.srcDmabufOffset); } } -#endif // Register SRC memory region -#ifdef HAVE_DMABUF_SUPPORT if (rss.srcDmabufFd >= 0) { IBV_PTR_CALL(rss.srcMemRegion, ibv_reg_dmabuf_mr, rss.srcProtect, rss.srcDmabufOffset, rss.numBytes, (uint64_t)rss.srcMem[0], rss.srcDmabufFd, rdmaMemRegFlags); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d registered SRC memory using ibv_reg_dmabuf_mr\n", GetRank()); } - } else -#endif - { + } else { IBV_PTR_CALL(rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d registered SRC memory using ibv_reg_mr (standard path)\n", GetRank()); @@ -3530,27 +3528,22 @@ static bool IsConfiguredGid(union ibv_gid const& gid) IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext); // Export DMA-BUF for DST memory if it's GPU memory -#if defined(HAVE_DMABUF_SUPPORT) && !defined(__NVCC__) - if (!t.dsts.empty() && IsGpuMemType(t.dsts[0].memType) && CheckKernelDmabufSupport()) { + if (CheckDmabufSupport() && !t.dsts.empty() && IsGpuMemType(t.dsts[0].memType)) { ERR_CHECK(ExportDmabuf(rss.dstMem[0], rss.numBytes, rss.dstDmabufFd, rss.dstDmabufOffset)); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d exported DST GPU memory as DMA-BUF (fd=%d, offset=%lu)\n", GetRank(), rss.dstDmabufFd, rss.dstDmabufOffset); } } -#endif // Register DST memory region -#ifdef HAVE_DMABUF_SUPPORT if (rss.dstDmabufFd >= 0) { IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_dmabuf_mr, rss.dstProtect, rss.dstDmabufOffset, rss.numBytes, (uint64_t)rss.dstMem[0], 
rss.dstDmabufFd, rdmaMemRegFlags); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d registered DST memory using ibv_reg_dmabuf_mr\n", GetRank()); } - } else -#endif - { + } else { IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags); if (System::Get().IsVerbose()) { printf("[INFO] Rank %d registered DST memory using ibv_reg_mr (standard path)\n", GetRank()); @@ -3721,16 +3714,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) if (isDstRank) IBV_CALL(ibv_dereg_mr, rss.dstMemRegion); // Close DMA-BUF file descriptors -#if defined(HAVE_DMABUF_SUPPORT) && !defined(__NVCC__) - if (isSrcRank && rss.srcDmabufFd >= 0) { - close(rss.srcDmabufFd); - rss.srcDmabufFd = -1; - } - if (isDstRank && rss.dstDmabufFd >= 0) { - close(rss.dstDmabufFd); - rss.dstDmabufFd = -1; + if (CheckDmabufSupport()) { + if (isSrcRank && rss.srcDmabufFd >= 0) { + close(rss.srcDmabufFd); + rss.srcDmabufFd = -1; + } + if (isDstRank && rss.dstDmabufFd >= 0) { + close(rss.dstDmabufFd); + rss.dstDmabufFd = -1; + } } -#endif // Destroy queue pairs if (isSrcRank) { @@ -3758,7 +3751,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } -#endif // NIC_EXEC_ENABLED // Data validation-related functions //======================================================================================== @@ -4445,14 +4437,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Prepare for NIC-based executors if (IsNicExeType(exeDevice.exeType)) { -#ifdef NIC_EXEC_ENABLED - for (auto& rss : exeInfo.resources) { - Transfer const& t = transfers[rss.transferIdx]; - ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss)); + if (TbIbvSymbolsReady()) { + for (auto& rss : exeInfo.resources) { + Transfer const& t = transfers[rss.transferIdx]; + ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss)); + } + } else { + return {ERR_FATAL, "RDMA executor is not supported"}; } -#else - return {ERR_FATAL, "RDMA executor is 
not supported"}; -#endif } // Check that GPU wallclock rate is non-zero @@ -4549,11 +4541,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #endif // Destroy NIC related resources -#ifdef NIC_EXEC_ENABLED - if (IsNicExeType(exeDevice.exeType)) { + if (TbIbvSymbolsReady() && IsNicExeType(exeDevice.exeType)) { ERR_CHECK(TeardownNicTransferResources(rss, t)); } -#endif } // Teardown additional requirements for GPU-based executors @@ -4679,7 +4669,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return ERR_NONE; } -#ifdef NIC_EXEC_ENABLED // Execution of a single NIC Transfer static ErrResult ExecuteNicTransfer(int const iteration, ConfigOptions const& cfg, @@ -4780,7 +4769,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } return ERR_NONE; } -#endif + // GFX Executor-related functions //======================================================================================== @@ -5701,9 +5690,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); -#ifdef NIC_EXEC_ENABLED case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); -#endif #ifdef BMA_EXEC_ENABLED case EXE_GPU_BDMA: return RunBmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); #endif @@ -6110,11 +6097,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) TransferResult& tfrResult = results.tfrResults[transferIdx]; tfrResult.exeDevice = exeDevice; -#ifdef NIC_EXEC_ENABLED tfrResult.exeDstDevice = {exeDevice.exeType, rss.dstNicIndex}; -#else - tfrResult.exeDstDevice = exeDevice; -#endif tfrResult.numBytes = rss.numBytes; tfrResult.avgDurationMsec = rss.totalDurationMsec / numTimedIterations; tfrResult.avgBandwidthGbPerSec = (rss.numBytes / 1.0e6) / tfrResult.avgDurationMsec; @@ -6616,6 +6599,14 
@@ static bool IsConfiguredGid(union ibv_gid const& gid) Log("[INFO] Running in single node mode\n"); } + // Probe libibverbs at process start (idempotent via std::call_once inside + // TbIbvEnsureLoaded). The integer status maps directly onto the rdma / + // dmabuf feature flags consumed elsewhere in TransferBench: + // TB_IBV_OK -> rdma=true, dmabuf=true + // TB_IBV_NO_DMABUF -> rdma=true, dmabuf=false + // TB_IBV_NO_RDMA -> rdma=false, dmabuf=false + TbIbvLoadStatus const ibvStatus = TbIbvGetLoadStatus(); + // Collect topology and distribute across all ranks CollectTopology(); } @@ -7349,22 +7340,23 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // NIC Executor int numNics = 0; -#ifdef NIC_EXEC_ENABLED - numNics = GetIbvDeviceList().size(); - for (int exeIndex = 0; exeIndex < numNics; exeIndex++) { - topo.closestCpuNumaToNic[exeIndex] = GetIbvDeviceList()[exeIndex].numaNode; - topo.executorName[{EXE_NIC, exeIndex}] = GetIbvDeviceList()[exeIndex].name; - topo.nicIsActive[exeIndex] = GetIbvDeviceList()[exeIndex].hasActivePort; - if (verbose) { - auto const& nic = GetIbvDeviceList()[exeIndex]; - Log("[INFO] Rank %03d: NIC [%02d/%02d] %s BDF %s NUMA %d active=%s\n", - rank, exeIndex, numNics, nic.name.c_str(), - nic.busId.empty() ? "?" : nic.busId.c_str(), - topo.closestCpuNumaToNic[exeIndex], - nic.hasActivePort ? "yes" : "no"); + if (TbIbvSymbolsReady()) + { + numNics = GetIbvDeviceList().size(); + for (int exeIndex = 0; exeIndex < numNics; exeIndex++) { + topo.closestCpuNumaToNic[exeIndex] = GetIbvDeviceList()[exeIndex].numaNode; + topo.executorName[{EXE_NIC, exeIndex}] = GetIbvDeviceList()[exeIndex].name; + topo.nicIsActive[exeIndex] = GetIbvDeviceList()[exeIndex].hasActivePort; + if (verbose) { + auto const& nic = GetIbvDeviceList()[exeIndex]; + Log("[INFO] Rank %03d: NIC [%02d/%02d] %s BDF %s NUMA %d active=%s\n", + rank, exeIndex, numNics, nic.name.c_str(), + nic.busId.empty() ? "?" 
: nic.busId.c_str(), + topo.closestCpuNumaToNic[exeIndex], + nic.hasActivePort ? "yes" : "no"); + } } } -#endif topo.numExecutors[EXE_NIC] = topo.numExecutors[EXE_NIC_NEAREST] = numNics; for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { @@ -7389,101 +7381,100 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } // Figure out closest NICs to GPUs -#ifdef NIC_EXEC_ENABLED - // Build up list of NIC bus addresses std::vector ibvAddressList; auto const& ibvDeviceList = GetIbvDeviceList(); - for (auto const& ibvDevice : ibvDeviceList) - ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : ""); - - // Track how many times a device has been assigned as "closest" - // This allows distributed work across devices using multiple ports (sharing the same busID) - // NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port - // Counter example: - // - // G0 prefers (N0,N1), picks N0 - // G1 prefers (N1,N2), picks N1 - // G2 prefers N0, picks N0 - // - // instead of G0->N1, G1->N2, G2->N0 - - std::vector assignedCount(ibvDeviceList.size(), 0); - - // Loop over each GPU to find the closest NIC(s) based on PCIe address - for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) { - if (gpuAddressList[gpuIndex].empty()) continue; - const char* hipPciBusId = gpuAddressList[gpuIndex].c_str(); - - // Find closest NICs - std::set closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList); - - // Pick the least-used NIC to assign as closest - int closestIdx = -1; - for (auto idx : closestNicIdxs) { - if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx]) - closestIdx = idx; - } - - // The following will only use distance between bus IDs - // to determine the closest NIC to GPU if the PCIe tree approach fails - if (closestIdx < 0) { -#ifdef VERBS_DEBUG - Log("[WARN] Falling back to PCIe bus ID distance to determine proximity\n"); -#endif - int minDistance = std::numeric_limits::max(); - for (int 
nicIndex = 0; nicIndex < numNics; nicIndex++) { - if (ibvDeviceList[nicIndex].busId != "") { - int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[nicIndex].busId); - if (distance < minDistance && distance >= 0) { - minDistance = distance; - closestIdx = nicIndex; + if (TbIbvSymbolsReady()) { + for (auto const& ibvDevice : ibvDeviceList) + ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : ""); + + // Track how many times a device has been assigned as "closest" + // This allows distributed work across devices using multiple ports (sharing the same busID) + // NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port + // Counter example: + // + // G0 prefers (N0,N1), picks N0 + // G1 prefers (N1,N2), picks N1 + // G2 prefers N0, picks N0 + // + // instead of G0->N1, G1->N2, G2->N0 + + std::vector assignedCount(ibvDeviceList.size(), 0); + + // Loop over each GPU to find the closest NIC(s) based on PCIe address + for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) { + if (gpuAddressList[gpuIndex].empty()) continue; + const char* hipPciBusId = gpuAddressList[gpuIndex].c_str(); + + // Find closest NICs + std::set closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList); + + // Pick the least-used NIC to assign as closest + int closestIdx = -1; + for (auto idx : closestNicIdxs) { + if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx]) + closestIdx = idx; + } + + // The following will only use distance between bus IDs + // to determine the closest NIC to GPU if the PCIe tree approach fails + if (closestIdx < 0) { + #ifdef VERBS_DEBUG + Log("[WARN] Falling back to PCIe bus ID distance to determine proximity\n"); + #endif + int minDistance = std::numeric_limits::max(); + for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { + if (ibvDeviceList[nicIndex].busId != "") { + int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[nicIndex].busId); + if (distance < minDistance && 
distance >= 0) { + minDistance = distance; + closestIdx = nicIndex; + } } } } + if (closestIdx != -1) { + topo.closestNicsToGpu[gpuIndex].push_back(closestIdx); + assignedCount[closestIdx]++; + } } - if (closestIdx != -1) { - topo.closestNicsToGpu[gpuIndex].push_back(closestIdx); - assignedCount[closestIdx]++; - } - } - // Compute the reverse mapping: closest GPU(s) for each NIC - // Loop over each NIC to find the closest GPU(s) based on PCIe address - for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { - if (!ibvDeviceList[nicIndex].hasActivePort || ibvDeviceList[nicIndex].busId.empty()) { - continue; - } + // Compute the reverse mapping: closest GPU(s) for each NIC + // Loop over each NIC to find the closest GPU(s) based on PCIe address + for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { + if (!ibvDeviceList[nicIndex].hasActivePort || ibvDeviceList[nicIndex].busId.empty()) { + continue; + } - // Find closest GPUs using LCA algorithm - std::set closestGpuIdxs = GetNearestDevicesInTree(ibvDeviceList[nicIndex].busId, gpuAddressList); + // Find closest GPUs using LCA algorithm + std::set closestGpuIdxs = GetNearestDevicesInTree(ibvDeviceList[nicIndex].busId, gpuAddressList); - if (closestGpuIdxs.empty()) { - // Fallback: use bus ID distance - int minDistance = std::numeric_limits::max(); - int closestIdx = -1; + if (closestGpuIdxs.empty()) { + // Fallback: use bus ID distance + int minDistance = std::numeric_limits::max(); + int closestIdx = -1; - for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) { - if (gpuAddressList[gpuIdx].empty()) continue; + for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) { + if (gpuAddressList[gpuIdx].empty()) continue; - int distance = GetBusIdDistance(ibvDeviceList[nicIndex].busId, gpuAddressList[gpuIdx]); - if (distance >= 0 && distance < minDistance) { - minDistance = distance; - closestIdx = gpuIdx; + int distance = GetBusIdDistance(ibvDeviceList[nicIndex].busId, gpuAddressList[gpuIdx]); + if (distance >= 0 && distance < 
minDistance) { + minDistance = distance; + closestIdx = gpuIdx; + } } - } - if (closestIdx != -1) { - topo.closestGpusToNic[nicIndex].push_back(closestIdx); - } - } else { - // Store all GPUs that are equally close - for (int idx : closestGpuIdxs) { - topo.closestGpusToNic[nicIndex].push_back(idx); + if (closestIdx != -1) { + topo.closestGpusToNic[nicIndex].push_back(closestIdx); + } + } else { + // Store all GPUs that are equally close + for (int idx : closestGpuIdxs) { + topo.closestGpusToNic[nicIndex].push_back(idx); + } } } } -#endif if (verbose) { for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) { @@ -7503,20 +7494,20 @@ static bool IsConfiguredGid(union ibv_gid const& gid) Log("\n"); } } -#ifdef NIC_EXEC_ENABLED - for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { - Log("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, - ibvDeviceList[nicIndex].name.c_str()); - if (topo.closestGpusToNic[nicIndex].size() == 0) { - Log(" none"); - } else { - for (auto gpuIndex : topo.closestGpusToNic[nicIndex]) { - Log(" %d", gpuIndex); + if (TbIbvSymbolsReady()) { + for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { + Log("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, + ibvDeviceList[nicIndex].name.c_str()); + if (topo.closestGpusToNic[nicIndex].size() == 0) { + Log(" none"); + } else { + for (auto gpuIndex : topo.closestGpusToNic[nicIndex]) { + Log(" %d", gpuIndex); + } } + Log("\n"); } - Log("\n"); } -#endif } } diff --git a/third-party/ibverbs/IbvDynLoad.hpp b/third-party/ibverbs/IbvDynLoad.hpp new file mode 100644 index 00000000..e64928ac --- /dev/null +++ b/third-party/ibverbs/IbvDynLoad.hpp @@ -0,0 +1,210 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include <dlfcn.h> +#include <mutex> + +#include "ibv_core.hpp" + +/// @brief Outcome of the runtime libibverbs probe. +/// +/// Stored on @ref IbvDynloadState::status and surfaced via +/// @ref TbIbvGetLoadStatus(). The integer values are stable and may be copied +/// directly into TransferBench's `System` fields. +/// - TB_IBV_OK (0): all required symbols resolved (RDMA + DMA-BUF). +/// - TB_IBV_NO_DMABUF (1): RDMA symbols resolved, but `ibv_reg_dmabuf_mr` +/// is missing (older libibverbs / no DMA-BUF API). +/// - TB_IBV_NO_RDMA (2): dlopen failed, or any non-DMA-BUF symbol is +/// missing. Whole library is treated as unusable. 
+enum TbIbvLoadStatus { + TB_IBV_OK = 0, + TB_IBV_NO_DMABUF = 1, + TB_IBV_NO_RDMA = 2, +}; + +#define IBV_FN(name, rettype, arglist) rettype(*name)arglist = nullptr; + +namespace { + +IBV_FN(ibv_alloc_pd, ibv_pd*, (ibv_context*)) +IBV_FN(ibv_close_device, int, (ibv_context*)) +IBV_FN(ibv_create_cq, ibv_cq*, (ibv_context*, int, void*, ibv_comp_channel*, int)) +IBV_FN(ibv_create_qp, ibv_qp*, (ibv_pd*, ibv_qp_init_attr*)) +IBV_FN(ibv_dealloc_pd, int, (ibv_pd*)) +IBV_FN(ibv_dereg_mr, int, (ibv_mr*)) +IBV_FN(ibv_destroy_cq, int, (ibv_cq*)) +IBV_FN(ibv_destroy_qp, int, (ibv_qp*)) +IBV_FN(ibv_free_device_list, void, (ibv_device**)) +IBV_FN(ibv_get_device_list, ibv_device**, (int*)) +IBV_FN(ibv_get_device_name, const char*, (ibv_device*)) +IBV_FN(ibv_modify_qp, int, (ibv_qp*, ibv_qp_attr*, int)) +IBV_FN(ibv_open_device, ibv_context*, (ibv_device*)) +IBV_FN(ibv_poll_cq, int, (ibv_cq*, int, ibv_wc*)) +IBV_FN(ibv_post_send, int, (ibv_qp*, ibv_send_wr*, ibv_send_wr**)) +IBV_FN(ibv_query_device, int, (ibv_context*, ibv_device_attr*)) +IBV_FN(ibv_query_gid, int, (ibv_context*, uint8_t, int, ibv_gid*)) +// MEMO: Previously the IBV_DIRECT path bound to `___ibv_query_port` because +// older versions of only exposed `ibv_query_port` as an +// inline wrapper around that internal extern symbol. Now that we no longer +// include verbs.h (it has been vendored into ibv_core.hpp), we always link or +// dlsym the public `ibv_query_port` symbol from libibverbs.so.1 directly. +// Revisit only if support for pre-1.1 libibverbs ever becomes a requirement. +IBV_FN(ibv_query_port, int, (ibv_context*, uint8_t, ibv_port_attr*)) +// `ibv_reg_dmabuf_mr` is always declared; whether the underlying symbol +// actually exists in the loaded libibverbs is decided at runtime by tryLoad(). 
+IBV_FN(ibv_reg_dmabuf_mr, ibv_mr*, (ibv_pd*, uint64_t, size_t, uint64_t, int, int)) +IBV_FN(ibv_reg_mr, ibv_mr*, (ibv_pd*, void*, size_t, int)) + +} // namespace + +struct IbvDynloadState { + std::once_flag once{}; + void* handle = nullptr; + TbIbvLoadStatus status = TB_IBV_NO_RDMA; + + /// @brief Run dlopen + dlsym once and classify the outcome. + /// @return One of the @ref TbIbvLoadStatus values, also stored in @c status. + TbIbvLoadStatus tryLoad() + { + status = TB_IBV_NO_RDMA; + + handle = dlopen("libibverbs.so.1", RTLD_NOW); + if (handle == nullptr) + return status; + + struct Symbol { void **ppfn; char const *name; }; + + // Core RDMA symbols. Failure of any of these means RDMA is unusable, so we + // tear the whole library back down and report TB_IBV_NO_RDMA. + Symbol coreSymbols[] = { + {(void**)&ibv_alloc_pd, "ibv_alloc_pd"}, + {(void**)&ibv_close_device, "ibv_close_device"}, + {(void**)&ibv_create_cq, "ibv_create_cq"}, + {(void**)&ibv_create_qp, "ibv_create_qp"}, + {(void**)&ibv_dealloc_pd, "ibv_dealloc_pd"}, + {(void**)&ibv_dereg_mr, "ibv_dereg_mr"}, + {(void**)&ibv_destroy_cq, "ibv_destroy_cq"}, + {(void**)&ibv_destroy_qp, "ibv_destroy_qp"}, + {(void**)&ibv_free_device_list, "ibv_free_device_list"}, + {(void**)&ibv_get_device_list, "ibv_get_device_list"}, + {(void**)&ibv_get_device_name, "ibv_get_device_name"}, + {(void**)&ibv_modify_qp, "ibv_modify_qp"}, + {(void**)&ibv_open_device, "ibv_open_device"}, + {(void**)&ibv_poll_cq, "ibv_poll_cq"}, + {(void**)&ibv_post_send, "ibv_post_send"}, + {(void**)&ibv_query_device, "ibv_query_device"}, + {(void**)&ibv_query_gid, "ibv_query_gid"}, + {(void**)&ibv_query_port, "ibv_query_port"}, + {(void**)&ibv_reg_mr, "ibv_reg_mr"}, + }; + + for (Symbol const& s : coreSymbols) { + void* sym = dlsym(handle, s.name); + if (sym == nullptr) { + // Roll back any pointer already wired so callers don't see a half-loaded library. 
+ for (Symbol const& r : coreSymbols) *r.ppfn = nullptr; + dlclose(handle); + handle = nullptr; + return status; // TB_IBV_NO_RDMA + } + *s.ppfn = sym; + } + + // DMA-BUF probe is independent: missing symbol downgrades to TB_IBV_NO_DMABUF + // but RDMA stays usable. + void* dmabufSym = dlsym(handle, "ibv_reg_dmabuf_mr"); + if (dmabufSym != nullptr) { + *((void**)&ibv_reg_dmabuf_mr) = dmabufSym; + status = TB_IBV_OK; + } else { + ibv_reg_dmabuf_mr = nullptr; + status = TB_IBV_NO_DMABUF; + } + return status; + } +}; + +inline IbvDynloadState& ibvDynloadState() +{ + static IbvDynloadState s; + return s; +} + +inline void TbIbvEnsureLoaded() +{ + IbvDynloadState& st = ibvDynloadState(); + std::call_once(st.once, [&]() { st.tryLoad(); }); +} + +inline TbIbvLoadStatus TbIbvGetLoadStatus() +{ + TbIbvEnsureLoaded(); + return ibvDynloadState().status; +} + +inline bool TbIbvSymbolsReady() +{ + return TbIbvGetLoadStatus() != TB_IBV_NO_RDMA; +} + +inline bool TbIbvDmabufPresent() +{ + return TbIbvGetLoadStatus() == TB_IBV_OK; +} + +inline void* TbIbvDlHandle() +{ + TbIbvEnsureLoaded(); + return ibvDynloadState().handle; +} + +inline void TbIbvUnload() +{ + IbvDynloadState& st = ibvDynloadState(); + if (st.handle != nullptr) { + dlclose(st.handle); + st.handle = nullptr; + st.status = TB_IBV_NO_RDMA; + ibv_alloc_pd = nullptr; + ibv_close_device = nullptr; + ibv_create_cq = nullptr; + ibv_create_qp = nullptr; + ibv_dealloc_pd = nullptr; + ibv_dereg_mr = nullptr; + ibv_destroy_cq = nullptr; + ibv_destroy_qp = nullptr; + ibv_free_device_list = nullptr; + ibv_get_device_list = nullptr; + ibv_get_device_name = nullptr; + ibv_modify_qp = nullptr; + ibv_open_device = nullptr; + ibv_poll_cq = nullptr; + ibv_post_send = nullptr; + ibv_query_device = nullptr; + ibv_query_gid = nullptr; + ibv_query_port = nullptr; + ibv_reg_dmabuf_mr = nullptr; + ibv_reg_mr = nullptr; + } +} diff --git a/third-party/ibverbs/IbvHeader.hpp b/third-party/ibverbs/IbvHeader.hpp new file mode 100644 index 
00000000..8c964a05 --- /dev/null +++ b/third-party/ibverbs/IbvHeader.hpp @@ -0,0 +1,544 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +extern "C" { + +// --------------------------------------------------------------------------- +// Opaque handles (forward declarations only) +// --------------------------------------------------------------------------- +struct ibv_pd; +struct ibv_cq; +struct ibv_srq; +struct ibv_ah; +struct ibv_mw; +struct ibv_dm; +struct ibv_xrcd; +struct ibv_comp_channel; + +// --------------------------------------------------------------------------- +// ibv_gid - 16-byte GID. verbs.h declares the inner fields as __be64 (big +// endian). TransferBench only memcpys/broadcasts/compares this opaque blob, +// so plain uint64_t preserves the layout without dragging in . 
+// --------------------------------------------------------------------------- +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +// --------------------------------------------------------------------------- +// Device enumeration types +// --------------------------------------------------------------------------- +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC, + IBV_NODE_USNIC, + IBV_NODE_USNIC_UDP, + IBV_NODE_UNSPECIFIED, +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP, + IBV_TRANSPORT_USNIC, + IBV_TRANSPORT_USNIC_UDP, + IBV_TRANSPORT_UNSPECIFIED, +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB, +}; + +// ibv_device_ops: 2 opaque function pointers; preserved purely for layout. +struct _ibv_device_ops { + struct ibv_context *(*_dummy1)(struct ibv_device *device, int cmd_fd); + void (*_dummy2)(struct ibv_context *context); +}; + +enum { + IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256, +}; + +struct ibv_device { + struct _ibv_device_ops _ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + char name[IBV_SYSFS_NAME_MAX]; + char dev_name[IBV_SYSFS_NAME_MAX]; + char dev_path[IBV_SYSFS_PATH_MAX]; + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +// We only ever read ->device (offset 0). The remaining fields (ops table, +// fds, mutex, ...) are intentionally omitted - libibverbs allocates and +// frees these objects and we never take sizeof(ibv_context). 
+struct ibv_context { + struct ibv_device *device; +}; + +// --------------------------------------------------------------------------- +// Device / port attributes (populated by ibv_query_device / ibv_query_port) +// --------------------------------------------------------------------------- +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + unsigned int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5, +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5, +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; 
+ uint8_t phys_state; + uint8_t link_layer; + uint8_t flags; + uint16_t port_cap_flags2; + uint32_t active_speed_ex; +}; + +// --------------------------------------------------------------------------- +// Memory region (populated by ibv_reg_mr / ibv_reg_dmabuf_mr) +// --------------------------------------------------------------------------- +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +// --------------------------------------------------------------------------- +// Address handle / global route (used inside ibv_qp_attr.ah_attr) +// --------------------------------------------------------------------------- +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +// --------------------------------------------------------------------------- +// Queue pair init / modify attributes +// --------------------------------------------------------------------------- +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC = 3, + IBV_QPT_UD = 4, + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV = 10, + IBV_QPT_DRIVER = 0xff, +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + 
IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20, + IBV_QP_RATE_LIMIT = 1 << 25, +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR, + IBV_QPS_UNKNOWN, +}; + +// ibv_qp - layout matches libibverbs through qp_num. TransferBench only ever +// holds ibv_qp* returned by ibv_create_qp and reads qp_num, so the trailing +// libibverbs members (mutex/cond/events_completed) are intentionally omitted: +// we never allocate or sizeof an ibv_qp, and every accessed field sits at its +// real ABI offset. ibv_srq stays opaque (pointer only). 
+struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED, +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + unsigned int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; + uint32_t rate_limit; +}; + +// --------------------------------------------------------------------------- +// Memory access / send flags +// --------------------------------------------------------------------------- +// IBV_ACCESS_RELAXED_ORDERING resolves to IB_UVERBS_ACCESS_OPTIONAL_FIRST, +// which the kernel uAPI defines as (1 << 20). 
+enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1 << 1), + IBV_ACCESS_REMOTE_READ = (1 << 2), + IBV_ACCESS_REMOTE_ATOMIC = (1 << 3), + IBV_ACCESS_MW_BIND = (1 << 4), + IBV_ACCESS_ZERO_BASED = (1 << 5), + IBV_ACCESS_ON_DEMAND = (1 << 6), + IBV_ACCESS_HUGETLB = (1 << 7), + IBV_ACCESS_FLUSH_GLOBAL = (1 << 8), + IBV_ACCESS_FLUSH_PERSISTENT = (1 << 9), + IBV_ACCESS_RELAXED_ORDERING = (1 << 20), +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD, + IBV_WR_LOCAL_INV, + IBV_WR_BIND_MW, + IBV_WR_SEND_WITH_INV, + IBV_WR_TSO, + IBV_WR_DRIVER1, + IBV_WR_FLUSH = 14, + IBV_WR_ATOMIC_WRITE = 15, +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3, + IBV_SEND_IP_CSUM = 1 << 4, +}; + +// --------------------------------------------------------------------------- +// Scatter/gather and work request (consumed by ibv_post_send) +// --------------------------------------------------------------------------- +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +// Forward decl needed by ibv_send_wr.bind_mw (kept for layout). Mirrors +// verbs.h's struct ibv_mw_bind_info exactly. +struct ibv_mw_bind_info { + struct ibv_mr *mr; + uint64_t addr; + uint64_t length; + unsigned int mw_access_flags; +}; + +// Full ABI-exact ibv_send_wr. TransferBench only sets the rdma arm of `wr`, +// but the union's overall size must match the system layout because the +// driver may write through the entire struct. 
+struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + unsigned int send_flags; + union { + uint32_t imm_data; + uint32_t invalidate_rkey; + }; + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + union { + struct { + struct ibv_mw *mw; + uint32_t rkey; + struct ibv_mw_bind_info bind_info; + } bind_mw; + struct { + void *hdr; + uint16_t hdr_sz; + uint16_t mss; + } tso; + }; +}; + +// --------------------------------------------------------------------------- +// Completion queue entry (populated by ibv_poll_cq) +// --------------------------------------------------------------------------- +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR, + IBV_WC_TM_ERR, + IBV_WC_TM_RNDV_INCOMPLETE, +}; + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, + IBV_WC_LOCAL_INV, + IBV_WC_TSO, + IBV_WC_FLUSH, + IBV_WC_ATOMIC_WRITE = 9, + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM, + IBV_WC_TM_ADD, + IBV_WC_TM_DEL, + IBV_WC_TM_SYNC, + IBV_WC_TM_RECV, + IBV_WC_TM_NO_TAG, + IBV_WC_DRIVER1, + IBV_WC_DRIVER2, + IBV_WC_DRIVER3, +}; + +struct 
ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + union { + uint32_t imm_data; + uint32_t invalidated_rkey; + }; + uint32_t qp_num; + uint32_t src_qp; + unsigned int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; +} // extern "C" From 87875fe81b4763d7908dc86c093d60ac706051e3 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Wed, 10 Jun 2026 20:51:00 +0000 Subject: [PATCH 2/4] missed some changes --- CMakeLists.txt | 6 ++-- Makefile | 4 +-- third-party/ibverbs/IbvDynLoad.hpp | 23 ++----------- third-party/ibverbs/IbvHeader.hpp | 53 +++++++++++++++++++----------- 4 files changed, 41 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 704ae42e..8a4b7f65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,7 +314,7 @@ endif() ## NIC / RDMA executor ## ## libibverbs is loaded dynamically at runtime via dlopen/dlsym -## (see vendors/ibverbs/IbvDynLoad.hpp). The build neither links against +## (see third-party/ibverbs/IbvDynLoad.hpp). The build neither links against ## -libverbs nor requires libibverbs-dev to be installed on the build host. ## Only libdl needs to be linked so that dlopen/dlsym resolve. message(STATUS "NIC executor: libibverbs is loaded dynamically at runtime (no -libverbs link, no build-host dependency)") @@ -451,9 +451,9 @@ target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets - vendors/ibverbs) + third-party/ibverbs) -# libdl supplies dlopen/dlsym used by vendors/ibverbs/IbvDynLoad.hpp. +# libdl supplies dlopen/dlsym used by third-party/ibverbs/IbvDynLoad.hpp. 
target_link_libraries(TransferBench PRIVATE ${CMAKE_DL_LIBS}) if(MPI_COMM_FOUND) if(TARGET MPI::MPI_CXX) diff --git a/Makefile b/Makefile index ea0ce643..66477a9d 100644 --- a/Makefile +++ b/Makefile @@ -81,10 +81,10 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) else COMMON_FLAGS += -O0 -g -ggdb3 endif - COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets -I./vendors/ibverbs + COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets -I./third-party/ibverbs # libibverbs is loaded dynamically at runtime via dlopen/dlsym (see - # vendors/ibverbs/IbvDynLoad.hpp), so the build never links against -libverbs + # third-party/ibverbs/IbvDynLoad.hpp), so the build never links against -libverbs # and does not require libibverbs-dev to be installed. We only need -ldl so # the dynamic loader API is resolvable. LDFLAGS += -lpthread -ldl diff --git a/third-party/ibverbs/IbvDynLoad.hpp b/third-party/ibverbs/IbvDynLoad.hpp index e64928ac..54c63b60 100644 --- a/third-party/ibverbs/IbvDynLoad.hpp +++ b/third-party/ibverbs/IbvDynLoad.hpp @@ -25,18 +25,8 @@ THE SOFTWARE. #include #include -#include "ibv_core.hpp" - -/// @brief Outcome of the runtime libibverbs probe. -/// -/// Stored on @ref IbvDynloadState::status and surfaced via -/// @ref TbIbvGetLoadStatus(). The integer values are stable and may be copied -/// directly into TransferBench's `System` fields. -/// - TB_IBV_OK (0): all required symbols resolved (RDMA + DMA-BUF). -/// - TB_IBV_NO_DMABUF (1): RDMA symbols resolved, but `ibv_reg_dmabuf_mr` -/// is missing (older libibverbs / no DMA-BUF API). -/// - TB_IBV_NO_RDMA (2): dlopen failed, or any non-DMA-BUF symbol is -/// missing. Whole library is treated as unusable. 
+#include "IbvHeader.hpp" + enum TbIbvLoadStatus { TB_IBV_OK = 0, TB_IBV_NO_DMABUF = 1, @@ -64,18 +54,11 @@ IBV_FN(ibv_poll_cq, int, (ibv_cq*, int, ibv_wc*)) IBV_FN(ibv_post_send, int, (ibv_qp*, ibv_send_wr*, ibv_send_wr**)) IBV_FN(ibv_query_device, int, (ibv_context*, ibv_device_attr*)) IBV_FN(ibv_query_gid, int, (ibv_context*, uint8_t, int, ibv_gid*)) -// MEMO: Previously the IBV_DIRECT path bound to `___ibv_query_port` because -// older versions of only exposed `ibv_query_port` as an -// inline wrapper around that internal extern symbol. Now that we no longer -// include verbs.h (it has been vendored into ibv_core.hpp), we always link or -// dlsym the public `ibv_query_port` symbol from libibverbs.so.1 directly. -// Revisit only if support for pre-1.1 libibverbs ever becomes a requirement. IBV_FN(ibv_query_port, int, (ibv_context*, uint8_t, ibv_port_attr*)) // `ibv_reg_dmabuf_mr` is always declared; whether the underlying symbol // actually exists in the loaded libibverbs is decided at runtime by tryLoad(). IBV_FN(ibv_reg_dmabuf_mr, ibv_mr*, (ibv_pd*, uint64_t, size_t, uint64_t, int, int)) IBV_FN(ibv_reg_mr, ibv_mr*, (ibv_pd*, void*, size_t, int)) - } // namespace struct IbvDynloadState { @@ -83,8 +66,6 @@ struct IbvDynloadState { void* handle = nullptr; TbIbvLoadStatus status = TB_IBV_NO_RDMA; - /// @brief Run dlopen + dlsym once and classify the outcome. - /// @return One of the @ref TbIbvLoadStatus values, also stored in @c status. TbIbvLoadStatus tryLoad() { status = TB_IBV_NO_RDMA; diff --git a/third-party/ibverbs/IbvHeader.hpp b/third-party/ibverbs/IbvHeader.hpp index 8c964a05..d961e82d 100644 --- a/third-party/ibverbs/IbvHeader.hpp +++ b/third-party/ibverbs/IbvHeader.hpp @@ -1,23 +1,38 @@ /* -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. +* Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. +* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. +* Copyright (c) 2005 PathScale, Inc. All rights reserved. +* Copyright (c) 2020 Intel Corporation. All rights reserved. +* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. +* +* This software is available to you under a choice of one of two +* licenses. 
You may choose to be licensed under the terms of the GNU +* General Public License (GPL) Version 2, available from the file +* COPYING in the main directory of this source tree, or the +* OpenIB.org BSD license below: +* +* Redistribution and use in source and binary forms, with or +* without modification, are permitted provided that the following +* conditions are met: +* +* - Redistributions of source code must retain the above +* copyright notice, this list of conditions and the following +* disclaimer. +* +* - Redistributions in binary form must reproduce the above +* copyright notice, this list of conditions and the following +* disclaimer in the documentation and/or other materials +* provided with the distribution. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
*/ #pragma once From 311c7ec4a20e60e32f505469ab82ac65f74a98a4 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 16 Jun 2026 20:02:22 +0000 Subject: [PATCH 3/4] fixing client --- src/client/Client.cpp | 4 +- src/client/EnvVars.hpp | 77 +++++++++++++----------------- src/client/Topology.hpp | 3 +- src/header/TransferBench.hpp | 26 ++++------ third-party/ibverbs/IbvDynLoad.hpp | 50 +++++++++---------- 5 files changed, 71 insertions(+), 89 deletions(-) diff --git a/src/client/Client.cpp b/src/client/Client.cpp index 3074f3b1..5df1d62b 100644 --- a/src/client/Client.cpp +++ b/src/client/Client.cpp @@ -226,9 +226,7 @@ int main(int argc, char **argv) void DisplayVersion() { bool nicSupport = false, mpiSupport = false, podSupport = false; -#if NIC_EXEC_ENABLED - nicSupport = true; -#endif + nicSupport = IsIbvSymbolsReady(); #if MPI_COMM_ENABLED mpiSupport = true; #endif diff --git a/src/client/EnvVars.hpp b/src/client/EnvVars.hpp index 34940b69..a3fe80d0 100644 --- a/src/client/EnvVars.hpp +++ b/src/client/EnvVars.hpp @@ -369,27 +369,23 @@ class EnvVars printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n"); printf(" GPU_MAX_HW_QUEUES - Max hardware queues per GPU device (default = 4)\n"); printf(" HIDE_ENV - Hide environment variable value listing\n"); -#if NIC_EXEC_ENABLED - printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); - printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); - printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); -#endif + if (IsIbvSymbolsReady()) { + printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); + printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n"); + printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n"); + printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); + printf(" NIC_CQ_POLL_BATCH - Number of CQ entries to poll per ibv_poll_cq call (default 
= 4)\n"); + printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering\n"); + printf(" NIC_SERVICE_LEVEL - IB service level (sl) for InfiniBand QPs (default=0)\n"); + printf(" NIC_TRAFFIC_CLASS - DSCP/traffic class byte for RoCE GRH (default=0)\n"); + printf(" ROCE_VERSION - RoCE version (default=2)\n"); + } printf(" MIN_VAR_SUBEXEC - Minimum # of subexecutors to use for variable subExec Transfers\n"); printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); -#if NIC_EXEC_ENABLED - printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n"); - printf(" NIC_CQ_POLL_BATCH - Number of CQ entries to poll per ibv_poll_cq call (default = 4)\n"); - printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering\n"); - printf(" NIC_SERVICE_LEVEL - IB service level (sl) for InfiniBand QPs (default=0)\n"); - printf(" NIC_TRAFFIC_CLASS - DSCP/traffic class byte for RoCE GRH (default=0)\n"); -#endif printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. 
Must be non-negative\n"); printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); -#if NIC_EXEC_ENABLED - printf(" ROCE_VERSION - RoCE version (default=2)\n"); -#endif printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" SHOW_BORDERS - Show ASCII box-drawing characters in tables\n"); printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); @@ -443,9 +439,9 @@ class EnvVars { int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX); std::string nicSupport = ""; -#if NIC_EXEC_ENABLED - nicSupport = " (with NIC support)"; -#endif + if (IsIbvSymbolsReady()) { + nicSupport = " (with NIC support)"; + } if (!outputToCsv) { if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n"); } @@ -499,32 +495,31 @@ class EnvVars Print("GPU_MAX_HW_QUEUES", gpuMaxHwQueues, "Max %d hardware queues per GPU device", gpuMaxHwQueues); -#if NIC_EXEC_ENABLED - Print("IP_ADDRESS_FAMILY", ipAddressFamily, - "IP address family is set to IPv%d", ipAddressFamily); - - Print("IB_GID_INDEX", ibGidIndex, - "RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str())); - Print("IB_PORT_NUMBER", ibPort, - "IB port number is set to %d", ibPort); -#endif + if (IsIbvSymbolsReady()) { + Print("IP_ADDRESS_FAMILY", ipAddressFamily, + "IP address family is set to IPv%d", ipAddressFamily); + Print("IB_GID_INDEX", ibGidIndex, + "RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str())); + Print("IB_PORT_NUMBER", ibPort, + "IB port number is set to %d", ibPort); + Print("NIC_CHUNK_BYTES", nicChunkBytes, + "Sending %lu bytes at a time for NIC RDMA", nicChunkBytes); + Print("NIC_CQ_POLL_BATCH", nicCqPollBatch, + "Polling %d CQ entries per ibv_poll_cq call", nicCqPollBatch); + Print("NIC_RELAX_ORDER", nicRelaxedOrder, + "Using %s ordering for NIC RDMA", nicRelaxedOrder ? 
"relaxed" : "strict"); + Print("NIC_SERVICE_LEVEL", nicServiceLevel, + "IB service level (sl) set to %d", nicServiceLevel); + Print("NIC_TRAFFIC_CLASS", nicTrafficClass, + "RoCE traffic class (DSCP) set to %d", nicTrafficClass); + Print("ROCE_VERSION", roceVersion, + "RoCE version is set to %d", roceVersion); + } Print("MIN_VAR_SUBEXEC", minNumVarSubExec, "Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec); Print("MAX_VAR_SUBEXEC", maxNumVarSubExec, "Using up to %s subexecutors for variable subExec transfers", maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available"); -#if NIC_EXEC_ENABLED - Print("NIC_CHUNK_BYTES", nicChunkBytes, - "Sending %lu bytes at a time for NIC RDMA", nicChunkBytes); - Print("NIC_CQ_POLL_BATCH", nicCqPollBatch, - "Polling %d CQ entries per ibv_poll_cq call", nicCqPollBatch); - Print("NIC_RELAX_ORDER", nicRelaxedOrder, - "Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict"); - Print("NIC_SERVICE_LEVEL", nicServiceLevel, - "IB service level (sl) set to %d", nicServiceLevel); - Print("NIC_TRAFFIC_CLASS", nicTrafficClass, - "RoCE traffic class (DSCP) set to %d", nicTrafficClass); -#endif Print("NUM_ITERATIONS", numIterations, (numIterations == 0) ? "Running infinitely" : "Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); @@ -532,10 +527,6 @@ class EnvVars "Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str()); Print("NUM_WARMUPS", numWarmups, "Running %d warmup iteration(s) per Test", numWarmups); -#if NIC_EXEC_ENABLED - Print("ROCE_VERSION", roceVersion, - "RoCE version is set to %d", roceVersion); -#endif Print("SHOW_BORDERS", showBorders, "%s ASCII box-drawing characaters in tables", showBorders ? "Showing" : "Hiding"); Print("SHOW_ITERATIONS", showIterations, "%s per-iteration timing", showIterations ? 
"Showing" : "Hiding"); diff --git a/src/client/Topology.hpp b/src/client/Topology.hpp index 09269377..1f5501eb 100644 --- a/src/client/Topology.hpp +++ b/src/client/Topology.hpp @@ -45,7 +45,7 @@ static int RemappedCpuIndex(int origIdx) static void PrintNicToGPUTopo(bool outputToCsv) { -#ifdef NIC_EXEC_ENABLED + if (!IsIbvSymbolsReady()) return; printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n"); if(!outputToCsv) printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n"); @@ -73,7 +73,6 @@ static void PrintNicToGPUTopo(bool outputToCsv) ); } printf("\n"); -#endif } void DisplaySingleRankTopology(bool outputToCsv) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index cd39f892..1b51d082 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -1691,7 +1691,7 @@ namespace { // rocr and hsa_ext_amd header is always mandatory, so no need to check for them pfn_hsa_amd_portable_export_dmabuf = (hsa_status_t (*)(const void*, size_t, int*, uint64_t*))dlsym(RTLD_DEFAULT, "hsa_amd_portable_export_dmabuf"); - if (pfn_hsa_amd_portable_export_dmabuf == nullptr || !TbIbvDmabufPresent()) { + if (pfn_hsa_amd_portable_export_dmabuf == nullptr || !IsIbvDmabufPresent()) { support = 0; return support; } @@ -2126,7 +2126,7 @@ namespace { } // Check NIC options - if (TbIbvSymbolsReady()) { + if (IsIbvSymbolsReady()) { if (cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) { errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a non-negative multiple of 4"}); } @@ -2504,7 +2504,7 @@ namespace { break; #endif case EXE_NIC: case EXE_NIC_NEAREST: - if (TbIbvSymbolsReady()) + if (IsIbvSymbolsReady()) { // NIC Executors can only execute a copy operation if (t.srcs.size() != 1 || t.dsts.size() != 1) { @@ -2992,7 +2992,7 @@ namespace { static vector ibvDeviceList = {}; // Build list on first use - if (TbIbvSymbolsReady() && 
!isInitialized) { + if (IsIbvSymbolsReady() && !isInitialized) { // Query the number of IBV devices int numIbvDevices = 0; @@ -4437,7 +4437,7 @@ namespace { // Prepare for NIC-based executors if (IsNicExeType(exeDevice.exeType)) { - if (TbIbvSymbolsReady()) { + if (IsIbvSymbolsReady()) { for (auto& rss : exeInfo.resources) { Transfer const& t = transfers[rss.transferIdx]; ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss)); @@ -4541,7 +4541,7 @@ namespace { #endif // Destroy NIC related resources - if (TbIbvSymbolsReady() && IsNicExeType(exeDevice.exeType)) { + if (IsIbvSymbolsReady() && IsNicExeType(exeDevice.exeType)) { ERR_CHECK(TeardownNicTransferResources(rss, t)); } } @@ -6599,14 +6599,6 @@ namespace { Log("[INFO] Running in single node mode\n"); } - // Probe libibverbs at process start (idempotent via std::call_once inside - // TbIbvEnsureLoaded). The integer status maps directly onto the rdma / - // dmabuf feature flags consumed elsewhere in TransferBench: - // TB_IBV_OK -> rdma=true, dmabuf=true - // TB_IBV_NO_DMABUF -> rdma=true, dmabuf=false - // TB_IBV_NO_RDMA -> rdma=false, dmabuf=false - TbIbvLoadStatus const ibvStatus = TbIbvGetLoadStatus(); - // Collect topology and distribute across all ranks CollectTopology(); } @@ -7340,7 +7332,7 @@ namespace { // NIC Executor int numNics = 0; - if (TbIbvSymbolsReady()) + if (IsIbvSymbolsReady()) { numNics = GetIbvDeviceList().size(); for (int exeIndex = 0; exeIndex < numNics; exeIndex++) { @@ -7384,7 +7376,7 @@ namespace { // Build up list of NIC bus addresses std::vector ibvAddressList; auto const& ibvDeviceList = GetIbvDeviceList(); - if (TbIbvSymbolsReady()) { + if (IsIbvSymbolsReady()) { for (auto const& ibvDevice : ibvDeviceList) ibvAddressList.push_back(ibvDevice.hasActivePort ? 
ibvDevice.busId : ""); @@ -7494,7 +7486,7 @@ namespace { Log("\n"); } } - if (TbIbvSymbolsReady()) { + if (IsIbvSymbolsReady()) { for (int nicIndex = 0; nicIndex < numNics; nicIndex++) { Log("[INFO] Rank %03d: NIC [%02d/%02d] %s Closest GPUs:", rank, nicIndex, numNics, ibvDeviceList[nicIndex].name.c_str()); diff --git a/third-party/ibverbs/IbvDynLoad.hpp b/third-party/ibverbs/IbvDynLoad.hpp index 54c63b60..ee3cb7a7 100644 --- a/third-party/ibverbs/IbvDynLoad.hpp +++ b/third-party/ibverbs/IbvDynLoad.hpp @@ -27,10 +27,10 @@ THE SOFTWARE. #include "IbvHeader.hpp" -enum TbIbvLoadStatus { - TB_IBV_OK = 0, - TB_IBV_NO_DMABUF = 1, - TB_IBV_NO_RDMA = 2, +enum IbvLoadStatus { + IBV_OK = 0, + IBV_NO_DMABUF = 1, + IBV_NO_RDMA = 2, }; #define IBV_FN(name, rettype, arglist) rettype(*name)arglist = nullptr; @@ -59,16 +59,18 @@ IBV_FN(ibv_query_port, int, (ibv_context*, uint8_t, ibv_port_attr*)) // actually exists in the loaded libibverbs is decided at runtime by tryLoad(). IBV_FN(ibv_reg_dmabuf_mr, ibv_mr*, (ibv_pd*, uint64_t, size_t, uint64_t, int, int)) IBV_FN(ibv_reg_mr, ibv_mr*, (ibv_pd*, void*, size_t, int)) -} // namespace +} + +#undef IBV_FN struct IbvDynloadState { std::once_flag once{}; void* handle = nullptr; - TbIbvLoadStatus status = TB_IBV_NO_RDMA; + IbvLoadStatus status = IBV_NO_RDMA; - TbIbvLoadStatus tryLoad() + IbvLoadStatus tryLoad() { - status = TB_IBV_NO_RDMA; + status = IBV_NO_RDMA; handle = dlopen("libibverbs.so.1", RTLD_NOW); if (handle == nullptr) @@ -77,7 +79,7 @@ struct IbvDynloadState { struct Symbol { void **ppfn; char const *name; }; // Core RDMA symbols. Failure of any of these means RDMA is unusable, so we - // tear the whole library back down and report TB_IBV_NO_RDMA. + // tear the whole library back down and report IBV_NO_RDMA. 
Symbol coreSymbols[] = { {(void**)&ibv_alloc_pd, "ibv_alloc_pd"}, {(void**)&ibv_close_device, "ibv_close_device"}, @@ -107,20 +109,20 @@ struct IbvDynloadState { for (Symbol const& r : coreSymbols) *r.ppfn = nullptr; dlclose(handle); handle = nullptr; - return status; // TB_IBV_NO_RDMA + return status; // IBV_NO_RDMA } *s.ppfn = sym; } - // DMA-BUF probe is independent: missing symbol downgrades to TB_IBV_NO_DMABUF + // DMA-BUF probe is independent: missing symbol downgrades to IBV_NO_DMABUF // but RDMA stays usable. void* dmabufSym = dlsym(handle, "ibv_reg_dmabuf_mr"); if (dmabufSym != nullptr) { *((void**)&ibv_reg_dmabuf_mr) = dmabufSym; - status = TB_IBV_OK; + status = IBV_OK; } else { ibv_reg_dmabuf_mr = nullptr; - status = TB_IBV_NO_DMABUF; + status = IBV_NO_DMABUF; } return status; } @@ -132,41 +134,41 @@ inline IbvDynloadState& ibvDynloadState() return s; } -inline void TbIbvEnsureLoaded() +inline void IbvEnsureLoaded() { IbvDynloadState& st = ibvDynloadState(); std::call_once(st.once, [&]() { st.tryLoad(); }); } -inline TbIbvLoadStatus TbIbvGetLoadStatus() +inline IbvLoadStatus IbvGetLoadStatus() { - TbIbvEnsureLoaded(); + IbvEnsureLoaded(); return ibvDynloadState().status; } -inline bool TbIbvSymbolsReady() +inline bool IsIbvSymbolsReady() { - return TbIbvGetLoadStatus() != TB_IBV_NO_RDMA; + return IbvGetLoadStatus() != IBV_NO_RDMA; } -inline bool TbIbvDmabufPresent() +inline bool IsIbvDmabufPresent() { - return TbIbvGetLoadStatus() == TB_IBV_OK; + return IbvGetLoadStatus() == IBV_OK; } -inline void* TbIbvDlHandle() +inline void* IbvDlHandle() { - TbIbvEnsureLoaded(); + IbvEnsureLoaded(); return ibvDynloadState().handle; } -inline void TbIbvUnload() +inline void IbvUnload() { IbvDynloadState& st = ibvDynloadState(); if (st.handle != nullptr) { dlclose(st.handle); st.handle = nullptr; - st.status = TB_IBV_NO_RDMA; + st.status = IBV_NO_RDMA; ibv_alloc_pd = nullptr; ibv_close_device = nullptr; ibv_create_cq = nullptr; From 
7e2cacb19551c7c9891df52cc23a54002a6e95df Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Mon, 22 Jun 2026 00:23:48 +0000 Subject: [PATCH 4/4] redirect context ops --- third-party/ibverbs/IbvDynLoad.hpp | 6 ----- third-party/ibverbs/IbvHeader.hpp | 35 ++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/third-party/ibverbs/IbvDynLoad.hpp b/third-party/ibverbs/IbvDynLoad.hpp index ee3cb7a7..32875278 100644 --- a/third-party/ibverbs/IbvDynLoad.hpp +++ b/third-party/ibverbs/IbvDynLoad.hpp @@ -50,8 +50,6 @@ IBV_FN(ibv_get_device_list, ibv_device**, (int*)) IBV_FN(ibv_get_device_name, const char*, (ibv_device*)) IBV_FN(ibv_modify_qp, int, (ibv_qp*, ibv_qp_attr*, int)) IBV_FN(ibv_open_device, ibv_context*, (ibv_device*)) -IBV_FN(ibv_poll_cq, int, (ibv_cq*, int, ibv_wc*)) -IBV_FN(ibv_post_send, int, (ibv_qp*, ibv_send_wr*, ibv_send_wr**)) IBV_FN(ibv_query_device, int, (ibv_context*, ibv_device_attr*)) IBV_FN(ibv_query_gid, int, (ibv_context*, uint8_t, int, ibv_gid*)) IBV_FN(ibv_query_port, int, (ibv_context*, uint8_t, ibv_port_attr*)) @@ -94,8 +92,6 @@ struct IbvDynloadState { {(void**)&ibv_get_device_name, "ibv_get_device_name"}, {(void**)&ibv_modify_qp, "ibv_modify_qp"}, {(void**)&ibv_open_device, "ibv_open_device"}, - {(void**)&ibv_poll_cq, "ibv_poll_cq"}, - {(void**)&ibv_post_send, "ibv_post_send"}, {(void**)&ibv_query_device, "ibv_query_device"}, {(void**)&ibv_query_gid, "ibv_query_gid"}, {(void**)&ibv_query_port, "ibv_query_port"}, @@ -182,8 +178,6 @@ inline void IbvUnload() ibv_get_device_name = nullptr; ibv_modify_qp = nullptr; ibv_open_device = nullptr; - ibv_poll_cq = nullptr; - ibv_post_send = nullptr; ibv_query_device = nullptr; ibv_query_gid = nullptr; ibv_query_port = nullptr; diff --git a/third-party/ibverbs/IbvHeader.hpp b/third-party/ibverbs/IbvHeader.hpp index d961e82d..ca734a5a 100644 --- a/third-party/ibverbs/IbvHeader.hpp +++ b/third-party/ibverbs/IbvHeader.hpp @@ -117,11 +117,25 @@ struct ibv_device { char 
ibdev_path[IBV_SYSFS_PATH_MAX]; }; -// We only ever read ->device (offset 0). The remaining fields (ops table, -// fds, mutex, ...) are intentionally omitted - libibverbs allocates and -// frees these objects and we never take sizeof(ibv_context). +struct ibv_context_ops { + void *_reserved_pre[11]; // _compat_query_device .. _compat_create_cq + void *poll_cq; // index 11 + void *_reserved_mid[13]; // req_notify_cq .. _compat_destroy_qp + void *post_send; // index 25 +}; + +// We read ->device (offset 0) and dispatch through ->ops. +// The trailing fields (fds, mutex, ...) are intentionally omitted; +// libibverbs allocates and frees these objects. struct ibv_context { - struct ibv_device *device; + struct ibv_device *device; + struct ibv_context_ops ops; +}; + +// We only ever read ->context (offset 0) to reach the ops dispatch table. +// The remaining fields are omitted since libibverbs owns the allocation. +struct ibv_cq { + struct ibv_context *context; }; // --------------------------------------------------------------------------- @@ -556,4 +570,17 @@ struct ibv_wc { uint8_t sl; uint8_t dlid_path_bits; }; + +static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) +{ + return ((int (*)(struct ibv_cq*, int, struct ibv_wc*))cq->context->ops.poll_cq)( + cq, num_entries, wc); +} + +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + return ((int (*)(struct ibv_qp*, struct ibv_send_wr*, struct ibv_send_wr**)) + qp->context->ops.post_send)(qp, wr, bad_wr); +} } // extern "C"