diff --git a/build.sh b/build.sh index 492c033686..bd0c5b3256 100755 --- a/build.sh +++ b/build.sh @@ -10,10 +10,12 @@ set -e # ./build.sh breakout --fast # Standalone executable (optimized) # ./build.sh breakout --web # Emscripten web build # ./build.sh breakout --profile # Kernel profiling binary +# ./build.sh breakout --rocm # HIP/ROCm training backend +# ./build.sh breakout --cuda # CUDA training backend # ./build.sh all # Build all envs with default and --float if [ -z "$1" ]; then - echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile|--cpu|--all]" + echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--cuda|--rocm] [--local|--fast|--web|--profile|--cpu|--all]" exit 1 fi ENV=$1 @@ -28,6 +30,8 @@ for arg in "$@"; do --web) MODE=web ;; --profile) MODE=profile ;; --cpu) MODE=cpu; PRECISION="-DPRECISION_FLOAT" ;; + --cuda) BACKEND=cuda ;; + --rocm) BACKEND=rocm ;; *) echo "Error: unknown argument '$arg'" && exit 1 ;; esac done @@ -168,57 +172,9 @@ elif [ "$MODE" = "web" ]; then exit 0 fi -# Find cuDNN path -CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}} -CUDNN_IFLAG="" -CUDNN_LFLAG="" -for dir in /usr/local/cuda/include /usr/include; do - if [ -f "$dir/cudnn.h" ]; then - CUDNN_IFLAG="-I$dir" - break - fi -done -for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do - if [ -f "$dir/libcudnn.so" ]; then - CUDNN_LFLAG="-L$dir" - break - fi -done -if [ -z "$CUDNN_IFLAG" ]; then - CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") -fi -if [ -z "$CUDNN_LFLAG" ]; then - CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") -fi - -# NCCL include/lib fallback (mirrors the cuDNN fallback above). -# Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv. -NCCL_IFLAG="" -NCCL_LFLAG="" -for dir in /usr/include /usr/local/cuda/include; do - if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi -done -for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do - if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi -done -if [ -z "$NCCL_IFLAG" ]; then - NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") -fi -if [ -z "$NCCL_LFLAG" ]; then - NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") -fi - -WHEEL_RPATH_FLAGS=() -for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do - if [[ "$lib_flag" == -L* ]]; then - WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}") - fi -done - export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}" export CCACHE_BASEDIR="$(pwd)" export CCACHE_COMPILERCHECK=content -NVCC="ccache $CUDA_HOME/bin/nvcc" CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}" ARCH=${NVCC_ARCH:-native} @@ -238,10 +194,42 @@ if [ ! -f "$BINDING_SRC" ]; then exit 1 fi +if [ -z "$MODE" ]; then + if [ -z "$BACKEND" ]; then + if python -c "from torch.utils.cpp_extension import IS_HIP_EXTENSION; raise SystemExit(0 if IS_HIP_EXTENSION else 1)" 2>/dev/null && ! command -v nvcc >/dev/null 2>&1; then + BACKEND=rocm + else + BACKEND=cuda + fi + fi +elif [ -n "$BACKEND" ]; then + echo "Error: --cuda/--rocm only apply to the training backend" + exit 1 +fi + +if [ "$BACKEND" = "rocm" ] && [ "$ENV" = "nmmo3" ]; then + echo "Error: NMMO3 native encoder is CUDA-only in build.sh --rocm" + exit 1 +fi + +CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-}} +CUDA_IFLAG="" +if [ "$BACKEND" = "cuda" ] || [ "$MODE" = "profile" ]; then + if [ -z "$CUDA_HOME" ]; then + if command -v nvcc >/dev/null 2>&1; then + CUDA_HOME=$(dirname "$(dirname "$(command -v nvcc)")") + else + echo "Error: nvcc not found. Use --rocm for HIP/ROCm or --cpu for CPU fallback." + exit 1 + fi + fi + CUDA_IFLAG="-I$CUDA_HOME/include" +fi + echo "Compiling static library for $ENV..." ${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \ -I. -Isrc -I$SRC_DIR -Ivendor \ - -I./$RAYLIB_NAME/include -I$CUDA_HOME/include \ + -I./$RAYLIB_NAME/include $CUDA_IFLAG \ -DPLATFORM_DESKTOP \ -fno-semantic-interposition -fvisibility=hidden \ -fPIC -fopenmp \ @@ -255,7 +243,59 @@ if [ -z "$OBS_TENSOR_T" ]; then exit 1 fi -if [ -z "$MODE" ]; then +if [ -z "$MODE" ] && [ "$BACKEND" = "cuda" ]; then + # Find cuDNN path + CUDNN_IFLAG="" + CUDNN_LFLAG="" + for dir in /usr/local/cuda/include /usr/include; do + if [ -f "$dir/cudnn.h" ]; then + CUDNN_IFLAG="-I$dir" + break + fi + done + for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do + if [ -f "$dir/libcudnn.so" ]; then + CUDNN_LFLAG="-L$dir" + break + fi + done + if [ -z "$CUDNN_IFLAG" ]; then + CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") + fi + if [ -z "$CUDNN_LFLAG" ]; then + CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") + fi + + # NCCL include/lib fallback (mirrors the cuDNN fallback above). + # Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv. + NCCL_IFLAG="" + NCCL_LFLAG="" + for dir in /usr/include /usr/local/cuda/include; do + if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi + done + for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do + if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi + done + if [ -z "$NCCL_IFLAG" ]; then + NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") + fi + if [ -z "$NCCL_LFLAG" ]; then + NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") + fi + + WHEEL_RPATH_FLAGS=() + for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do + if [[ "$lib_flag" == -L* ]]; then + WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}") + fi + done + + if command -v ccache >/dev/null 2>&1; then + NVCC="ccache $CUDA_HOME/bin/nvcc" + else + NVCC="$CUDA_HOME/bin/nvcc" + fi + echo "Compiling CUDA ($ARCH) training backend..." $NVCC -c -arch=$ARCH -Xcompiler -fPIC \ -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \ @@ -284,6 +324,142 @@ if [ -z "$MODE" ]; then "${LINK_CMD[@]}" echo "Built: $OUTPUT" +elif [ -z "$MODE" ] && [ "$BACKEND" = "rocm" ]; then + mapfile -t ROCM_INFO < <(python - <<'PY' +import os +from torch.utils.cpp_extension import ROCM_HOME, library_paths, include_paths + +rocm_home = os.environ.get("ROCM_HOME") or ROCM_HOME +if not rocm_home: + raise SystemExit("ROCM_HOME not found. Install/use a ROCm-enabled PyTorch environment.") +print(rocm_home) +print(os.environ.get("HIPCC") or os.path.join(rocm_home, "bin", "hipcc")) +print(os.pathsep.join(include_paths("cuda"))) +print(os.pathsep.join(library_paths("cuda"))) +PY +) + ROCM_HOME=${ROCM_INFO[0]} + HIPCC=${ROCM_INFO[1]} + ROCM_INCLUDE_PATHS=${ROCM_INFO[2]} + ROCM_LIBRARY_PATHS=${ROCM_INFO[3]} + + if [ ! -x "$HIPCC" ]; then + if command -v hipcc >/dev/null 2>&1; then + HIPCC=$(command -v hipcc) + else + echo "Error: hipcc not found" + exit 1 + fi + fi + + if [ -z "$HIP_CLANG_PATH" ] || [ ! -x "$HIP_CLANG_PATH/clang++" ]; then + for dir in "$ROCM_HOME/lib/llvm/bin" /usr/lib/llvm/*/bin; do + if [ -x "$dir/clang++" ]; then + export HIP_CLANG_PATH="$dir" + break + fi + done + fi + + HIPIFY_SRC="build/hip/src" + HIPIFY_SRC_ABS="$(pwd)/$HIPIFY_SRC" + SRC_ABS="$(pwd)/src" + echo "Hipifying CUDA sources into $HIPIFY_SRC..." + rm -rf "$HIPIFY_SRC" + python - < 1.2 * upper_cost: + threshold = 0.9 * max_score + else: + threshold = state['A'] + state['B'] * np.log(cost) + + targets = state['running_targets'] + metric_val = flat_logs[target_key] + targets.append(metric_val) + del targets[:-30] + score = max(np.mean(targets), metric_val) + if state.get('metric_distribution') == 'percentile': + score = _logit_transform(score) + + flat_logs['early_stop_threshold'] = max(threshold, -5) + flat_logs['is_loss_nan'] = False + return score < threshold + +def _train(env_name, args, result_queue=None, verbose=False, sweep_early_stop=None): '''Single-GPU training worker. Process target for both DDP ranks and sweep trials.''' backend = _resolve_backend(args) rank = args['rank'] + suppress_model_outputs = result_queue is not None or sweep_early_stop is not None run_id = str(int(1000*time.time())) if args['wandb']: import wandb @@ -233,7 +290,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): if epoch < train_epochs: backend.train(pufferl) - if (epoch % args['checkpoint_interval'] == 0 or epoch == train_epochs - 1) and sweep_obj is None: + if (epoch % args['checkpoint_interval'] == 0 or epoch == train_epochs - 1) and not suppress_model_outputs: model_path = os.path.join(checkpoint_dir, f'{pufferl.global_step:016d}.bin') backend.save_weights(pufferl, model_path) @@ -256,9 +313,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): if epoch < train_epochs: all_logs.append(flat_logs) - if (sweep_obj is not None - and pufferl.global_step > min(0.20*total_timesteps, 100_000_000) and - sweep_obj.early_stop(logs, target_key)): + if (sweep_early_stop is not None + and pufferl.global_step > min(0.20*total_timesteps, 100_000_000) + and _sweep_should_stop(flat_logs, target_key, sweep_early_stop)): break elif flat_logs['env/n'] > args['eval_episodes']: break @@ -302,7 +359,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): json.dump({**args, 'metrics': metrics}, f) if args['wandb']: - if sweep_obj is None and model_path: # Don't spam uploads during sweeps + if not suppress_model_outputs and model_path: # Don't spam uploads during sweeps artifact = wandb.Artifact(run_id, type='model') artifact.add_file(model_path) wandb.run.log_artifact(artifact) @@ -315,6 +372,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): def train(env_name, args=None, gpus=None, **kwargs): args = args or load_config(env_name) validate_config(args) + sweep_obj = kwargs.pop('sweep_obj', None) + if sweep_obj is not None and 'sweep_early_stop' not in kwargs: + kwargs['sweep_early_stop'] = _sweep_early_stop_state(sweep_obj) subprocess = gpus is not None gpus = list(gpus or range(args['train']['gpus'])) @@ -331,7 +391,7 @@ def train(env_name, args=None, gpus=None, **kwargs): worker_args['rank'] = rank worker_args['gpu_id'] = gpu_id if rank == 0 and not subprocess: - _train(env_name, worker_args, verbose=True) + _train(env_name, worker_args, verbose=True, **kwargs) else: ctx.Process(target=_train, args=(env_name, worker_args), kwargs=kwargs).start() @@ -340,7 +400,7 @@ def sweep(env_name, args=None, pareto=False): '''Train entry point. Handles single-GPU, multi-GPU DDP, and sweeps.''' args = args or load_config(env_name) exp_gpus = args['train']['gpus'] - sweep_gpus = args['sweep']['gpus'] or len(os.listdir('/proc/driver/nvidia/gpus')) + sweep_gpus = args['sweep']['gpus'] or torch.cuda.device_count() args['vec']['num_threads'] //= (sweep_gpus // exp_gpus) args['no_model_upload'] = True @@ -363,12 +423,15 @@ def sweep(env_name, args=None, pareto=False): active = {} completed = 0 while completed < num_experiments: - if len(active) >= sweep_gpus//exp_gpus: # Collect completed runs + max_active = sweep_gpus // exp_gpus + if active and (len(active) >= max_active or completed + len(active) >= num_experiments): gpu_id, scores, costs, timesteps = result_queue.get() done_args = active.pop(gpu_id) if not scores: sweep_obj.observe(done_args, 0, 0, is_failure=True) + completed += 1 + continue else: completed += 1 @@ -396,7 +459,7 @@ def sweep(env_name, args=None, pareto=False): exp_args = deepcopy(args) active[gpu_id] = exp_args train(env_name, exp_args, range(gpu_id, gpu_id + exp_gpus), - sweep_obj=sweep_obj, result_queue=result_queue) + result_queue=result_queue, sweep_early_stop=_sweep_early_stop_state(sweep_obj)) def eval(env_name, args=None, load_path=None): '''Evaluate a trained policy. Supports both native and --slowly torch backends.''' diff --git a/src/bindings.cu b/src/bindings.cu index 4469cb512c..3eee3064d2 100644 --- a/src/bindings.cu +++ b/src/bindings.cu @@ -69,18 +69,11 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) { // Utilization pybind11::dict util_dict; - nvmlUtilization_t util; - nvmlDeviceGetUtilizationRates(pufferl.nvml_device, &util); - util_dict["gpu_percent"] = (float)util.gpu; - - nvmlMemory_t mem; - nvmlDeviceGetMemoryInfo(pufferl.nvml_device, &mem); - util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total; - - size_t cuda_free, cuda_total; - cudaMemGetInfo(&cuda_free, &cuda_total); - util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f); - util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f); + GpuUtil util = gpu_get_utilization(pufferl.gpu_device); + util_dict["gpu_percent"] = util.gpu_percent; + util_dict["gpu_mem"] = util.gpu_mem; + util_dict["vram_used_gb"] = util.vram_used_gb; + util_dict["vram_total_gb"] = util.vram_total_gb; long rss_kb = 0; FILE* f = fopen("/proc/self/status", "r"); @@ -418,25 +411,15 @@ PYBIND11_MODULE(_C, m) { }); // Standalone utilization monitor (no PuffeRL instance needed) m.def("get_utilization", [](int gpu_id) { - static bool nvml_inited = false; - if (!nvml_inited) { nvmlInit(); nvml_inited = true; } - py::dict util_dict; - nvmlDevice_t device; - nvmlDeviceGetHandleByIndex(gpu_id, &device); - - nvmlUtilization_t util; - nvmlDeviceGetUtilizationRates(device, &util); - util_dict["gpu_percent"] = (float)util.gpu; - - nvmlMemory_t mem; - nvmlDeviceGetMemoryInfo(device, &mem); - util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total; - - size_t cuda_free, cuda_total; - cudaMemGetInfo(&cuda_free, &cuda_total); - util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f); - util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f); + PufferGpuDevice device; + gpu_monitor_init(gpu_id, &device); + GpuUtil util = gpu_get_utilization(device); + gpu_monitor_shutdown(); + util_dict["gpu_percent"] = util.gpu_percent; + util_dict["gpu_mem"] = util.gpu_mem; + util_dict["vram_used_gb"] = util.vram_used_gb; + util_dict["vram_total_gb"] = util.vram_total_gb; long rss_kb = 0; FILE* f = fopen("/proc/self/status", "r"); diff --git a/src/ocean.cu b/src/ocean.cu index baaa9b7be6..314c8311be 100644 --- a/src/ocean.cu +++ b/src/ocean.cu @@ -1,6 +1,10 @@ // NMMO3 CUDA encoder: multihot, cuDNN conv, embedding, concat, projection // Included by pufferlib.cu — requires precision_t, PrecisionTensor, Allocator, puf_mm, etc. +#ifdef USE_ROCM +static void create_custom_encoder(const std::string&, Encoder*) {} +#else + #include "cudnn_conv2d.cu" // ---- NMMO3 constants ---- @@ -588,3 +592,5 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) { }; } } + +#endif diff --git a/src/pufferlib.cu b/src/pufferlib.cu index 3a3e6ee00e..ba3b15bdb0 100644 --- a/src/pufferlib.cu +++ b/src/pufferlib.cu @@ -1,7 +1,11 @@ #include +#ifndef USE_ROCM #include #include #include +#else +#include +#endif #include #include @@ -45,6 +49,110 @@ typedef struct { float accum[NUM_PROF]; } ProfileT; +struct GpuUtil { + float gpu_percent; + float gpu_mem; + float vram_used_gb; + float vram_total_gb; +}; + +#ifndef USE_ROCM +using PufferGpuDevice = nvmlDevice_t; + +inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) { + nvmlInit(); + nvmlDeviceGetHandleByIndex(gpu_id, device); +} + +inline void gpu_monitor_shutdown() { + nvmlShutdown(); +} + +inline GpuUtil gpu_get_utilization(PufferGpuDevice device) { + GpuUtil out = {}; + nvmlUtilization_t util; + if (nvmlDeviceGetUtilizationRates(device, &util) == NVML_SUCCESS) { + out.gpu_percent = (float)util.gpu; + } + + nvmlMemory_t mem; + if (nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS && mem.total > 0) { + out.gpu_mem = 100.0f * (float)mem.used / (float)mem.total; + } + + size_t free_bytes = 0, total_bytes = 0; + cudaMemGetInfo(&free_bytes, &total_bytes); + if (total_bytes > 0) { + out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f); + out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f); + } + return out; +} + +inline void profile_begin(const char* tag, bool enable) { + if (enable) nvtxRangePushA(tag); +} + +inline void profile_end(bool enable) { + if (enable) nvtxRangePop(); +} + +inline void gpu_profiler_start(bool enable) { + if (enable) cudaProfilerStart(); +} + +inline void gpu_profiler_stop(bool enable) { + if (enable) cudaProfilerStop(); +} +#else +using PufferGpuDevice = uint32_t; + +inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) { + *device = (uint32_t)gpu_id; + rsmi_init(RSMI_INIT_FLAG_ALL_GPUS); +} + +inline void gpu_monitor_shutdown() { + rsmi_shut_down(); +} + +inline GpuUtil gpu_get_utilization(PufferGpuDevice device) { + GpuUtil out = {}; + uint32_t busy = 0; + if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) { + out.gpu_percent = (float)busy; + } + + uint64_t used = 0, total = 0; + if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS && + rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS && + total > 0) { + out.gpu_mem = 100.0f * (float)used / (float)total; + } + + size_t free_bytes = 0, total_bytes = 0; + cudaMemGetInfo(&free_bytes, &total_bytes); + if (total_bytes > 0) { + out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f); + out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f); + } + return out; +} + +inline void profile_begin(const char*, bool) {} +inline void profile_end(bool) {} +inline void gpu_profiler_start(bool) {} +inline void gpu_profiler_stop(bool) {} +#endif + +inline cudaError_t puf_graph_instantiate(cudaGraphExec_t* exec, cudaGraph_t graph) { +#ifdef USE_ROCM + return cudaGraphInstantiate(exec, graph, nullptr, nullptr, 0); +#else + return cudaGraphInstantiate(exec, graph, 0); +#endif +} + // Data collected by parallel environment workers. Each worker handles // a constant subset of agents struct RolloutBuf { @@ -316,7 +424,7 @@ typedef struct { PrecisionTensor grad_puf; LongTensor rng_offset_puf; // (num_buffers+1,) int64 CUDA device counters ProfileT profile; - nvmlDevice_t nvml_device; + PufferGpuDevice gpu_device; long epoch; long global_step; double start_time; @@ -335,14 +443,6 @@ Dict* log_environments_impl(PuffeRL& pufferl) { return out; } -inline void profile_begin(const char* tag, bool enable) { - if (enable) nvtxRangePushA(tag); -} - -inline void profile_end(bool enable) { - if (enable) nvtxRangePop(); -} - // Thread-local stream for per-buffer threads (set once by thread_init_wrapper) static thread_local cudaStream_t tl_stream = 0; @@ -556,7 +656,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { cudaGraph_t _graph; assert(cudaStreamEndCapture(current_stream, &_graph) == cudaSuccess && "cudaStreamEndCapture failed"); - assert(cudaGraphInstantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph, 0) == cudaSuccess + assert(puf_graph_instantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph) == cudaSuccess && "cudaGraphInstantiate failed"); assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed"); cudaDeviceSynchronize(); @@ -935,9 +1035,22 @@ void ppo_loss_fwd_bwd( } #define PRIO_WARP_SIZE 32 -#define PRIO_FULL_MASK 0xffffffff +#ifdef USE_ROCM +using PufWarpMask = unsigned long long; +#else +using PufWarpMask = unsigned int; +#endif #define PRIO_BLOCK_SIZE 256 #define PRIO_NUM_WARPS (PRIO_BLOCK_SIZE / PRIO_WARP_SIZE) + +__device__ __forceinline__ float prio_warp_sum(float val, int width = PRIO_WARP_SIZE) { + PufWarpMask mask = (PufWarpMask)__activemask(); + for (int s = width / 2; s >= 1; s /= 2) { + val += __shfl_down_sync(mask, val, s, width); + } + return val; +} + __global__ void compute_prio_adv_reduction( const precision_t* __restrict__ advantages, float* prio_weights, float prio_alpha, int stride) { @@ -950,9 +1063,7 @@ __global__ void compute_prio_adv_reduction( local_sum += fabsf(to_float(advantages[offset + t])); } - for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) { - local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s); - } + local_sum = prio_warp_sum(local_sum); if (tx == 0) { float pw = __powf(local_sum, prio_alpha); if (isnan(pw) || isinf(pw)) { @@ -975,9 +1086,7 @@ __global__ void compute_prio_normalize(float* prio_weights, int length) { for (int t = tx; t < length; t += blockDim.x) { local_sum += prio_weights[t]; } - for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) { - local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s); - } + local_sum = prio_warp_sum(local_sum); if (lane == 0) { shmem[warp_id] = local_sum; } @@ -985,9 +1094,7 @@ __global__ void compute_prio_normalize(float* prio_weights, int length) { if (warp_id == 0) { float val = (lane < PRIO_NUM_WARPS) ? shmem[lane] : 0.0f; - for (int s = PRIO_NUM_WARPS / 2; s >= 1; s /= 2) { - val += __shfl_down_sync(PRIO_FULL_MASK, val, s); - } + val = prio_warp_sum(val, PRIO_NUM_WARPS); if (tx == 0) { block_sum = val + eps; } @@ -1413,7 +1520,7 @@ void train_impl(PuffeRL& pufferl) { cudaGraph_t _graph; assert(cudaStreamEndCapture(train_stream, &_graph) == cudaSuccess && "cudaStreamEndCapture failed"); - assert(cudaGraphInstantiate(&pufferl.train_cudagraph, _graph, 0) == cudaSuccess + assert(puf_graph_instantiate(&pufferl.train_cudagraph, _graph) == cudaSuccess && "cudaGraphInstantiate failed"); assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed"); cudaDeviceSynchronize(); @@ -1517,8 +1624,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, cudaEventCreate(&pufferl->profile.events[i]); } memset(pufferl->profile.accum, 0, sizeof(pufferl->profile.accum)); - nvmlInit(); - nvmlDeviceGetHandleByIndex(hypers.gpu_id, &pufferl->nvml_device); + gpu_monitor_init(hypers.gpu_id, &pufferl->gpu_device); // Create policy int input_size = pufferl->env.obs.shape[1]; @@ -1751,10 +1857,8 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, net_callback_wrapper, thread_init_wrapper); static_vec_reset(vec); - if (hypers.profile) { - cudaDeviceSynchronize(); - cudaProfilerStart(); - } + if (hypers.profile) cudaDeviceSynchronize(); + gpu_profiler_start(hypers.profile); double now = wall_clock(); pufferl->start_time = now; @@ -1766,9 +1870,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, void close_impl(PuffeRL& pufferl) { cudaDeviceSynchronize(); - if (pufferl.hypers.profile) { - cudaProfilerStop(); - } + gpu_profiler_stop(pufferl.hypers.profile); cudaGraphExecDestroy(pufferl.train_cudagraph); for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) { @@ -1800,7 +1902,7 @@ void close_impl(PuffeRL& pufferl) { for (int i = 0; i < NUM_TRAIN_EVENTS; i++) { cudaEventDestroy(pufferl.profile.events[i]); } - nvmlShutdown(); + gpu_monitor_shutdown(); static_vec_close(pufferl.vec); diff --git a/src/rocm_cuda_shim.cpp b/src/rocm_cuda_shim.cpp new file mode 100644 index 0000000000..0feed14b2e --- /dev/null +++ b/src/rocm_cuda_shim.cpp @@ -0,0 +1,57 @@ +#include + +extern "C" { + +hipError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) { + return hipHostMalloc(ptr, size, flags); +} + +hipError_t cudaMalloc(void** ptr, size_t size) { + return hipMalloc(ptr, size); +} + +hipError_t cudaMemcpy(void* dst, const void* src, size_t size, int kind) { + return hipMemcpy(dst, src, size, (hipMemcpyKind)kind); +} + +hipError_t cudaMemcpyAsync(void* dst, const void* src, size_t size, int kind, hipStream_t stream) { + return hipMemcpyAsync(dst, src, size, (hipMemcpyKind)kind, stream); +} + +hipError_t cudaMemset(void* dst, int value, size_t size) { + return hipMemset(dst, value, size); +} + +hipError_t cudaFree(void* ptr) { + return hipFree(ptr); +} + +hipError_t cudaFreeHost(void* ptr) { + return hipHostFree(ptr); +} + +hipError_t cudaSetDevice(int device) { + return hipSetDevice(device); +} + +hipError_t cudaDeviceSynchronize(void) { + return hipDeviceSynchronize(); +} + +hipError_t cudaStreamSynchronize(hipStream_t stream) { + return hipStreamSynchronize(stream); +} + +hipError_t cudaStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) { + return hipStreamCreateWithFlags(stream, flags); +} + +hipError_t cudaStreamQuery(hipStream_t stream) { + return hipStreamQuery(stream); +} + +const char* cudaGetErrorString(hipError_t error) { + return hipGetErrorString(error); +} + +}