diff --git a/build.sh b/build.sh
index 492c033686..bd0c5b3256 100755
--- a/build.sh
+++ b/build.sh
@@ -10,10 +10,12 @@ set -e
 #   ./build.sh breakout --fast       # Standalone executable (optimized)
 #   ./build.sh breakout --web        # Emscripten web build
 #   ./build.sh breakout --profile    # Kernel profiling binary
+#   ./build.sh breakout --rocm       # HIP/ROCm training backend
+#   ./build.sh breakout --cuda       # CUDA training backend
 #   ./build.sh all                   # Build all envs with default and --float
 
 if [ -z "$1" ]; then
-    echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile|--cpu|--all]"
+    echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--cuda|--rocm] [--local|--fast|--web|--profile|--cpu|--all]"
     exit 1
 fi
 ENV=$1
@@ -28,6 +30,8 @@ for arg in "$@"; do
         --web)   MODE=web ;;
         --profile) MODE=profile ;;
         --cpu)   MODE=cpu; PRECISION="-DPRECISION_FLOAT" ;;
+        --cuda)  BACKEND=cuda ;;
+        --rocm)  BACKEND=rocm ;;
         *) echo "Error: unknown argument '$arg'" && exit 1 ;;
     esac
 done
@@ -168,57 +172,9 @@ elif [ "$MODE" = "web" ]; then
     exit 0
 fi
 
-# Find cuDNN path
-CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}}
-CUDNN_IFLAG=""
-CUDNN_LFLAG=""
-for dir in /usr/local/cuda/include /usr/include; do
-    if [ -f "$dir/cudnn.h" ]; then
-        CUDNN_IFLAG="-I$dir"
-        break
-    fi
-done
-for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do
-    if [ -f "$dir/libcudnn.so" ]; then
-        CUDNN_LFLAG="-L$dir"
-        break
-    fi
-done
-if [ -z "$CUDNN_IFLAG" ]; then
-    CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
-fi
-if [ -z "$CUDNN_LFLAG" ]; then
-    CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
-fi
-
-# NCCL include/lib fallback (mirrors the cuDNN fallback above).
-# Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv.
-NCCL_IFLAG=""
-NCCL_LFLAG=""
-for dir in /usr/include /usr/local/cuda/include; do
-    if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi
-done
-for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do
-    if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi
-done
-if [ -z "$NCCL_IFLAG" ]; then
-    NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
-fi
-if [ -z "$NCCL_LFLAG" ]; then
-    NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
-fi
-
-WHEEL_RPATH_FLAGS=()
-for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do
-    if [[ "$lib_flag" == -L* ]]; then
-        WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}")
-    fi
-done
-
 export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}"
 export CCACHE_BASEDIR="$(pwd)"
 export CCACHE_COMPILERCHECK=content
-NVCC="ccache $CUDA_HOME/bin/nvcc"
 CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}"
 ARCH=${NVCC_ARCH:-native}
 
@@ -238,10 +194,42 @@ if [ ! -f "$BINDING_SRC" ]; then
     exit 1
 fi
 
+if [ -z "$MODE" ]; then
+    if [ -z "$BACKEND" ]; then
+        if python -c "from torch.utils.cpp_extension import IS_HIP_EXTENSION; raise SystemExit(0 if IS_HIP_EXTENSION else 1)" 2>/dev/null && ! command -v nvcc >/dev/null 2>&1; then
+            BACKEND=rocm
+        else
+            BACKEND=cuda
+        fi
+    fi
+elif [ -n "$BACKEND" ]; then
+    echo "Error: --cuda/--rocm only apply to the training backend"
+    exit 1
+fi
+
+if [ "$BACKEND" = "rocm" ] && [ "$ENV" = "nmmo3" ]; then
+    echo "Error: NMMO3 native encoder is CUDA-only in build.sh --rocm"
+    exit 1
+fi
+
+CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-}}
+CUDA_IFLAG=""
+if [ "$BACKEND" = "cuda" ] || [ "$MODE" = "profile" ]; then
+    if [ -z "$CUDA_HOME" ]; then
+        if command -v nvcc >/dev/null 2>&1; then
+            CUDA_HOME=$(dirname "$(dirname "$(command -v nvcc)")")
+        else
+            echo "Error: nvcc not found. Use --rocm for HIP/ROCm or --cpu for CPU fallback."
+            exit 1
+        fi
+    fi
+    CUDA_IFLAG="-I$CUDA_HOME/include"
+fi
+
 echo "Compiling static library for $ENV..."
 ${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
     -I. -Isrc -I$SRC_DIR -Ivendor \
-    -I./$RAYLIB_NAME/include -I$CUDA_HOME/include \
+    -I./$RAYLIB_NAME/include $CUDA_IFLAG \
     -DPLATFORM_DESKTOP \
     -fno-semantic-interposition -fvisibility=hidden \
     -fPIC -fopenmp \
@@ -255,7 +243,59 @@ if [ -z "$OBS_TENSOR_T" ]; then
     exit 1
 fi
 
-if [ -z "$MODE" ]; then
+if [ -z "$MODE" ] && [ "$BACKEND" = "cuda" ]; then
+    # Find cuDNN path
+    CUDNN_IFLAG=""
+    CUDNN_LFLAG=""
+    for dir in /usr/local/cuda/include /usr/include; do
+        if [ -f "$dir/cudnn.h" ]; then
+            CUDNN_IFLAG="-I$dir"
+            break
+        fi
+    done
+    for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do
+        if [ -f "$dir/libcudnn.so" ]; then
+            CUDNN_LFLAG="-L$dir"
+            break
+        fi
+    done
+    if [ -z "$CUDNN_IFLAG" ]; then
+        CUDNN_IFLAG=$(python -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
+    fi
+    if [ -z "$CUDNN_LFLAG" ]; then
+        CUDNN_LFLAG=$(python -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    fi
+
+    # NCCL include/lib fallback (mirrors the cuDNN fallback above).
+    # Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv.
+    NCCL_IFLAG=""
+    NCCL_LFLAG=""
+    for dir in /usr/include /usr/local/cuda/include; do
+        if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi
+    done
+    for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do
+        if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi
+    done
+    if [ -z "$NCCL_IFLAG" ]; then
+        NCCL_IFLAG=$(python -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
+    fi
+    if [ -z "$NCCL_LFLAG" ]; then
+        NCCL_LFLAG=$(python -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    fi
+
+    WHEEL_RPATH_FLAGS=()
+    for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do
+        if [[ "$lib_flag" == -L* ]]; then
+            WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}")
+        fi
+    done
+
+    if command -v ccache >/dev/null 2>&1; then
+        NVCC="ccache $CUDA_HOME/bin/nvcc"
+    else
+        NVCC="$CUDA_HOME/bin/nvcc"
+    fi
+
     echo "Compiling CUDA ($ARCH) training backend..."
     $NVCC -c -arch=$ARCH -Xcompiler -fPIC \
         -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \
@@ -284,6 +324,142 @@ if [ -z "$MODE" ]; then
     "${LINK_CMD[@]}"
     echo "Built: $OUTPUT"
 
+elif [ -z "$MODE" ] && [ "$BACKEND" = "rocm" ]; then
+    mapfile -t ROCM_INFO < <(python - <<'PY'
+import os
+from torch.utils.cpp_extension import ROCM_HOME, library_paths, include_paths
+
+rocm_home = os.environ.get("ROCM_HOME") or ROCM_HOME
+if not rocm_home:
+    raise SystemExit("ROCM_HOME not found. Install/use a ROCm-enabled PyTorch environment.")
+print(rocm_home)
+print(os.environ.get("HIPCC") or os.path.join(rocm_home, "bin", "hipcc"))
+print(os.pathsep.join(include_paths("cuda")))
+print(os.pathsep.join(library_paths("cuda")))
+PY
+)
+    ROCM_HOME=${ROCM_INFO[0]}
+    HIPCC=${ROCM_INFO[1]}
+    ROCM_INCLUDE_PATHS=${ROCM_INFO[2]}
+    ROCM_LIBRARY_PATHS=${ROCM_INFO[3]}
+
+    if [ ! -x "$HIPCC" ]; then
+        if command -v hipcc >/dev/null 2>&1; then
+            HIPCC=$(command -v hipcc)
+        else
+            echo "Error: hipcc not found"
+            exit 1
+        fi
+    fi
+
+    if [ -z "$HIP_CLANG_PATH" ] || [ ! -x "$HIP_CLANG_PATH/clang++" ]; then
+        for dir in "$ROCM_HOME/lib/llvm/bin" /usr/lib/llvm/*/bin; do
+            if [ -x "$dir/clang++" ]; then
+                export HIP_CLANG_PATH="$dir"
+                break
+            fi
+        done
+    fi
+
+    HIPIFY_SRC="build/hip/src"
+    HIPIFY_SRC_ABS="$(pwd)/$HIPIFY_SRC"
+    SRC_ABS="$(pwd)/src"
+    echo "Hipifying CUDA sources into $HIPIFY_SRC..."
+    rm -rf "$HIPIFY_SRC"
+    python - <<PY
+from torch.utils.hipify import hipify_python
+hipify_python.hipify(
+    project_directory="$SRC_ABS",
+    output_directory="$HIPIFY_SRC_ABS",
+    includes=["*"],
+    show_progress=False,
+    show_detailed=False,
+    is_pytorch_extension=True,
+)
+PY
+
+    ROCM_IFLAGS=()
+    IFS=':' read -ra ROCM_INC_ARR <<< "$ROCM_INCLUDE_PATHS"
+    for dir in "${ROCM_INC_ARR[@]}"; do
+        [ -n "$dir" ] && ROCM_IFLAGS+=("-I$dir")
+    done
+    ROCM_LFLAGS=()
+    ROCM_RPATH_FLAGS=()
+    if [ -d /usr/lib64 ]; then
+        ROCM_LFLAGS+=("-L/usr/lib64")
+        ROCM_RPATH_FLAGS+=("-Wl,-rpath,/usr/lib64")
+    fi
+    IFS=':' read -ra ROCM_LIB_ARR <<< "$ROCM_LIBRARY_PATHS"
+    for dir in "${ROCM_LIB_ARR[@]}"; do
+        [ -n "$dir" ] || continue
+        [ "$dir" = "/usr/lib" ] && [ -d /usr/lib64 ] && continue
+        ROCM_LFLAGS+=("-L$dir")
+        ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+    done
+    ROCM_OMP_LIB=""
+    for dir in /usr/lib64 /usr/lib /usr/local/lib; do
+        if [ -f "$dir/libomp.so" ]; then
+            ROCM_LFLAGS+=("-L$dir")
+            ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+            ROCM_OMP_LIB="-lomp"
+            break
+        elif [ -f "$dir/libomp5.so" ]; then
+            ROCM_LFLAGS+=("-L$dir")
+            ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+            ROCM_OMP_LIB="-lomp5"
+            break
+        fi
+    done
+
+    ROCM_ARCH_FLAGS=()
+    if [ -n "$PYTORCH_ROCM_ARCH" ]; then
+        IFS=';' read -ra ROCM_ARCH_ARR <<< "$PYTORCH_ROCM_ARCH"
+        for arch in "${ROCM_ARCH_ARR[@]}"; do
+            [ -n "$arch" ] && ROCM_ARCH_FLAGS+=("--offload-arch=$arch")
+        done
+    fi
+
+    HIPCC_OPT=()
+    if [ -n "$DEBUG" ]; then
+        HIPCC_OPT=(-O0 -g)
+    else
+        HIPCC_OPT=(-O2)
+    fi
+
+    echo "Compiling ROCm/HIP training backend..."
+    "$HIPCC" "${ROCM_ARCH_FLAGS[@]}" -c -fPIC \
+        -D_GLIBCXX_USE_CXX11_ABI=1 \
+        -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION \
+        -DPLATFORM_DESKTOP \
+        -DUSE_ROCM \
+        -std=c++17 \
+        -I. -I"$HIPIFY_SRC" -I$SRC_DIR -Ivendor -I$RAYLIB_NAME/include \
+        -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \
+        "${ROCM_IFLAGS[@]}" \
+        -fopenmp \
+        -DOBS_TENSOR_T=$OBS_TENSOR_T \
+        -DENV_NAME=$ENV \
+        $PRECISION "${HIPCC_OPT[@]}" \
+        "$HIPIFY_SRC/bindings.hip" -o build/bindings.o
+
+    "$HIPCC" -c -fPIC -std=c++17 \
+        "${ROCM_IFLAGS[@]}" \
+        src/rocm_cuda_shim.cpp -o build/rocm_cuda_shim.o
+
+    LINK_CMD=(
+        ${CXX:-g++} -shared -fPIC -fopenmp
+        build/bindings.o build/rocm_cuda_shim.o "$STATIC_LIB" "$RAYLIB_A"
+        "${ROCM_LFLAGS[@]}"
+        "${ROCM_RPATH_FLAGS[@]}"
+        -lamdhip64 -lhipblas -lhiprand -lrccl -lrocm_smi64
+        $ROCM_OMP_LIB
+        $LINK_OPT
+        "${SHARED_LDFLAGS[@]}"
+        -o "$OUTPUT"
+    )
+    "${LINK_CMD[@]}"
+    echo "Built: $OUTPUT"
+
 elif [ "$MODE" = "cpu" ]; then
     echo "Compiling CPU training backend..."
     ${CXX:-g++} -c -fPIC -fopenmp \
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 8fc0c03a89..158d74ee0d 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -185,10 +185,67 @@ def _train_worker(args):
 
     backend.close(pufferl)
 
-def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
+def _sweep_early_stop_state(sweep_obj):
+    if sweep_obj is None:
+        return None
+
+    state = {'kind': sweep_obj.__class__.__name__}
+    model = getattr(sweep_obj, 'stop_threshold_model', None)
+    if model is not None:
+        state.update(
+            metric_distribution=sweep_obj.metric_distribution,
+            is_fitted=model.is_fitted,
+            upper_cost_threshold=model.upper_cost_threshold,
+            max_score=model.max_score,
+            A=model.A,
+            B=model.B,
+            running_targets=[],
+        )
+    return state
+
+def _logit_transform(value, epsilon=1e-9):
+    value = np.clip(value, epsilon, 1 - epsilon)
+    return np.clip(np.log(value / (1 - value)), -5, 100)
+
+def _sweep_should_stop(flat_logs, target_key, state):
+    for k, v in flat_logs.items():
+        if k.startswith('loss/') and np.issubdtype(type(v), np.number) and np.isnan(v):
+            flat_logs['is_loss_nan'] = True
+            return True
+
+    if not state or state.get('kind') != 'Protein' or not state.get('is_fitted'):
+        return False
+    if 'uptime' not in flat_logs or target_key not in flat_logs:
+        return False
+
+    cost = flat_logs['uptime']
+    upper_cost = state.get('upper_cost_threshold')
+    if upper_cost is None or cost < upper_cost * 0.3 + 10:
+        return False
+
+    max_score = state.get('max_score')
+    if cost > 1.2 * upper_cost:
+        threshold = 0.9 * max_score
+    else:
+        threshold = state['A'] + state['B'] * np.log(cost)
+
+    targets = state['running_targets']
+    metric_val = flat_logs[target_key]
+    targets.append(metric_val)
+    del targets[:-30]
+    score = max(np.mean(targets), metric_val)
+    if state.get('metric_distribution') == 'percentile':
+        score = _logit_transform(score)
+
+    flat_logs['early_stop_threshold'] = max(threshold, -5)
+    flat_logs['is_loss_nan'] = False
+    return score < threshold
+
+def _train(env_name, args, result_queue=None, verbose=False, sweep_early_stop=None):
     '''Single-GPU training worker. Process target for both DDP ranks and sweep trials.'''
     backend = _resolve_backend(args)
     rank = args['rank']
+    suppress_model_outputs = result_queue is not None or sweep_early_stop is not None
     run_id = str(int(1000*time.time()))
     if args['wandb']:
         import wandb
@@ -233,7 +290,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
         if epoch < train_epochs:
             backend.train(pufferl)
 
-        if (epoch % args['checkpoint_interval'] == 0 or epoch == train_epochs - 1) and sweep_obj is None:
+        if (epoch % args['checkpoint_interval'] == 0 or epoch == train_epochs - 1) and not suppress_model_outputs:
             model_path = os.path.join(checkpoint_dir, f'{pufferl.global_step:016d}.bin')
             backend.save_weights(pufferl, model_path)
 
@@ -256,9 +313,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
         if epoch < train_epochs:
             all_logs.append(flat_logs)
 
-            if (sweep_obj is not None
-                    and pufferl.global_step > min(0.20*total_timesteps, 100_000_000) and
-                    sweep_obj.early_stop(logs, target_key)):
+            if (sweep_early_stop is not None
+                    and pufferl.global_step > min(0.20*total_timesteps, 100_000_000)
+                    and _sweep_should_stop(flat_logs, target_key, sweep_early_stop)):
                 break
         elif flat_logs['env/n'] > args['eval_episodes']:
             break
@@ -302,7 +359,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
         json.dump({**args, 'metrics': metrics}, f)
 
     if args['wandb']:
-        if sweep_obj is None and model_path: # Don't spam uploads during sweeps
+        if not suppress_model_outputs and model_path: # Don't spam uploads during sweeps
             artifact = wandb.Artifact(run_id, type='model')
             artifact.add_file(model_path)
             wandb.run.log_artifact(artifact)
@@ -315,6 +372,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
 def train(env_name, args=None, gpus=None, **kwargs):
     args = args or load_config(env_name)
     validate_config(args)
+    sweep_obj = kwargs.pop('sweep_obj', None)
+    if sweep_obj is not None and 'sweep_early_stop' not in kwargs:
+        kwargs['sweep_early_stop'] = _sweep_early_stop_state(sweep_obj)
 
     subprocess = gpus is not None
     gpus = list(gpus or range(args['train']['gpus']))
@@ -331,7 +391,7 @@ def train(env_name, args=None, gpus=None, **kwargs):
         worker_args['rank'] = rank
         worker_args['gpu_id'] = gpu_id
         if rank == 0 and not subprocess:
-            _train(env_name, worker_args, verbose=True)
+            _train(env_name, worker_args, verbose=True, **kwargs)
         else:
             ctx.Process(target=_train, args=(env_name, worker_args),
                 kwargs=kwargs).start()
@@ -340,7 +400,7 @@ def sweep(env_name, args=None, pareto=False):
     '''Train entry point. Handles single-GPU, multi-GPU DDP, and sweeps.'''
     args = args or load_config(env_name)
     exp_gpus = args['train']['gpus']
-    sweep_gpus = args['sweep']['gpus'] or len(os.listdir('/proc/driver/nvidia/gpus'))
+    sweep_gpus = args['sweep']['gpus'] or torch.cuda.device_count()
     args['vec']['num_threads'] //= (sweep_gpus // exp_gpus)
     args['no_model_upload'] = True
 
@@ -363,12 +423,15 @@ def sweep(env_name, args=None, pareto=False):
     active = {}
     completed = 0
     while completed < num_experiments:
-        if len(active) >= sweep_gpus//exp_gpus: # Collect completed runs
+        max_active = sweep_gpus // exp_gpus
+        if active and (len(active) >= max_active or completed + len(active) >= num_experiments):
             gpu_id, scores, costs, timesteps = result_queue.get()
             done_args = active.pop(gpu_id)
 
             if not scores:
                 sweep_obj.observe(done_args, 0, 0, is_failure=True)
+                completed += 1
+                continue
             else:
                 completed += 1
 
@@ -396,7 +459,7 @@ def sweep(env_name, args=None, pareto=False):
         exp_args = deepcopy(args)
         active[gpu_id] = exp_args
         train(env_name, exp_args, range(gpu_id, gpu_id + exp_gpus),
-            sweep_obj=sweep_obj, result_queue=result_queue)
+            result_queue=result_queue, sweep_early_stop=_sweep_early_stop_state(sweep_obj))
 
 def eval(env_name, args=None, load_path=None):
     '''Evaluate a trained policy. Supports both native and --slowly torch backends.'''
diff --git a/src/bindings.cu b/src/bindings.cu
index 4469cb512c..3eee3064d2 100644
--- a/src/bindings.cu
+++ b/src/bindings.cu
@@ -69,18 +69,11 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) {
 
     // Utilization
     pybind11::dict util_dict;
-    nvmlUtilization_t util;
-    nvmlDeviceGetUtilizationRates(pufferl.nvml_device, &util);
-    util_dict["gpu_percent"] = (float)util.gpu;
-
-    nvmlMemory_t mem;
-    nvmlDeviceGetMemoryInfo(pufferl.nvml_device, &mem);
-    util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total;
-
-    size_t cuda_free, cuda_total;
-    cudaMemGetInfo(&cuda_free, &cuda_total);
-    util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f);
-    util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f);
+    GpuUtil util = gpu_get_utilization(pufferl.gpu_device);
+    util_dict["gpu_percent"] = util.gpu_percent;
+    util_dict["gpu_mem"] = util.gpu_mem;
+    util_dict["vram_used_gb"] = util.vram_used_gb;
+    util_dict["vram_total_gb"] = util.vram_total_gb;
 
     long rss_kb = 0;
     FILE* f = fopen("/proc/self/status", "r");
@@ -418,25 +411,15 @@ PYBIND11_MODULE(_C, m) {
     });
     // Standalone utilization monitor (no PuffeRL instance needed)
     m.def("get_utilization", [](int gpu_id) {
-        static bool nvml_inited = false;
-        if (!nvml_inited) { nvmlInit(); nvml_inited = true; }
-
         py::dict util_dict;
-        nvmlDevice_t device;
-        nvmlDeviceGetHandleByIndex(gpu_id, &device);
-
-        nvmlUtilization_t util;
-        nvmlDeviceGetUtilizationRates(device, &util);
-        util_dict["gpu_percent"] = (float)util.gpu;
-
-        nvmlMemory_t mem;
-        nvmlDeviceGetMemoryInfo(device, &mem);
-        util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total;
-
-        size_t cuda_free, cuda_total;
-        cudaMemGetInfo(&cuda_free, &cuda_total);
-        util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f);
-        util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f);
+        PufferGpuDevice device;
+        gpu_monitor_init(gpu_id, &device);
+        GpuUtil util = gpu_get_utilization(device);
+        gpu_monitor_shutdown();
+        util_dict["gpu_percent"] = util.gpu_percent;
+        util_dict["gpu_mem"] = util.gpu_mem;
+        util_dict["vram_used_gb"] = util.vram_used_gb;
+        util_dict["vram_total_gb"] = util.vram_total_gb;
 
         long rss_kb = 0;
         FILE* f = fopen("/proc/self/status", "r");
diff --git a/src/ocean.cu b/src/ocean.cu
index baaa9b7be6..314c8311be 100644
--- a/src/ocean.cu
+++ b/src/ocean.cu
@@ -1,6 +1,10 @@
 // NMMO3 CUDA encoder: multihot, cuDNN conv, embedding, concat, projection
 // Included by pufferlib.cu — requires precision_t, PrecisionTensor, Allocator, puf_mm, etc.
 
+#ifdef USE_ROCM
+static void create_custom_encoder(const std::string&, Encoder*) {}
+#else
+
 #include "cudnn_conv2d.cu"
 
 // ---- NMMO3 constants ----
@@ -588,3 +592,5 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) {
         };
     }
 }
+
+#endif
diff --git a/src/pufferlib.cu b/src/pufferlib.cu
index 3a3e6ee00e..ba3b15bdb0 100644
--- a/src/pufferlib.cu
+++ b/src/pufferlib.cu
@@ -1,7 +1,11 @@
 #include <cuda_runtime.h>
+#ifndef USE_ROCM
 #include <cuda_profiler_api.h>
 #include <nvtx3/nvToolsExt.h>
 #include <nvml.h>
+#else
+#include <rocm_smi/rocm_smi.h>
+#endif
 #include <nccl.h>
 
 #include <time.h>
@@ -45,6 +49,110 @@ typedef struct {
     float accum[NUM_PROF];
 } ProfileT;
 
+struct GpuUtil {
+    float gpu_percent;
+    float gpu_mem;
+    float vram_used_gb;
+    float vram_total_gb;
+};
+
+#ifndef USE_ROCM
+using PufferGpuDevice = nvmlDevice_t;
+
+inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) {
+    nvmlInit();
+    nvmlDeviceGetHandleByIndex(gpu_id, device);
+}
+
+inline void gpu_monitor_shutdown() {
+    nvmlShutdown();
+}
+
+inline GpuUtil gpu_get_utilization(PufferGpuDevice device) {
+    GpuUtil out = {};
+    nvmlUtilization_t util;
+    if (nvmlDeviceGetUtilizationRates(device, &util) == NVML_SUCCESS) {
+        out.gpu_percent = (float)util.gpu;
+    }
+
+    nvmlMemory_t mem;
+    if (nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS && mem.total > 0) {
+        out.gpu_mem = 100.0f * (float)mem.used / (float)mem.total;
+    }
+
+    size_t free_bytes = 0, total_bytes = 0;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    if (total_bytes > 0) {
+        out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f);
+        out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f);
+    }
+    return out;
+}
+
+inline void profile_begin(const char* tag, bool enable) {
+    if (enable) nvtxRangePushA(tag);
+}
+
+inline void profile_end(bool enable) {
+    if (enable) nvtxRangePop();
+}
+
+inline void gpu_profiler_start(bool enable) {
+    if (enable) cudaProfilerStart();
+}
+
+inline void gpu_profiler_stop(bool enable) {
+    if (enable) cudaProfilerStop();
+}
+#else
+using PufferGpuDevice = uint32_t;
+
+inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) {
+    *device = (uint32_t)gpu_id;
+    rsmi_init(RSMI_INIT_FLAG_ALL_GPUS);
+}
+
+inline void gpu_monitor_shutdown() {
+    rsmi_shut_down();
+}
+
+inline GpuUtil gpu_get_utilization(PufferGpuDevice device) {
+    GpuUtil out = {};
+    uint32_t busy = 0;
+    if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) {
+        out.gpu_percent = (float)busy;
+    }
+
+    uint64_t used = 0, total = 0;
+    if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS &&
+            rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS &&
+            total > 0) {
+        out.gpu_mem = 100.0f * (float)used / (float)total;
+    }
+
+    size_t free_bytes = 0, total_bytes = 0;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    if (total_bytes > 0) {
+        out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f);
+        out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f);
+    }
+    return out;
+}
+
+inline void profile_begin(const char*, bool) {}
+inline void profile_end(bool) {}
+inline void gpu_profiler_start(bool) {}
+inline void gpu_profiler_stop(bool) {}
+#endif
+
+inline cudaError_t puf_graph_instantiate(cudaGraphExec_t* exec, cudaGraph_t graph) {
+#ifdef USE_ROCM
+    return cudaGraphInstantiate(exec, graph, nullptr, nullptr, 0);
+#else
+    return cudaGraphInstantiate(exec, graph, 0);
+#endif
+}
+
 // Data collected by parallel environment workers. Each worker handles
 // a constant subset of agents 
 struct RolloutBuf {
@@ -316,7 +424,7 @@ typedef struct {
     PrecisionTensor grad_puf;
     LongTensor rng_offset_puf;   // (num_buffers+1,) int64 CUDA device counters
     ProfileT profile;
-    nvmlDevice_t nvml_device;
+    PufferGpuDevice gpu_device;
     long epoch;
     long global_step;
     double start_time;
@@ -335,14 +443,6 @@ Dict* log_environments_impl(PuffeRL& pufferl) {
     return out;
 }
 
-inline void profile_begin(const char* tag, bool enable) {
-    if (enable) nvtxRangePushA(tag);
-}
-
-inline void profile_end(bool enable) {
-    if (enable) nvtxRangePop();
-}
-
 // Thread-local stream for per-buffer threads (set once by thread_init_wrapper)
 static thread_local cudaStream_t tl_stream = 0;
 
@@ -556,7 +656,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
         cudaGraph_t _graph;
         assert(cudaStreamEndCapture(current_stream, &_graph) == cudaSuccess
                 && "cudaStreamEndCapture failed");
-        assert(cudaGraphInstantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph, 0) == cudaSuccess
+        assert(puf_graph_instantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph) == cudaSuccess
                 && "cudaGraphInstantiate failed");
         assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed");
         cudaDeviceSynchronize();
@@ -935,9 +1035,22 @@ void ppo_loss_fwd_bwd(
 }
 
 #define PRIO_WARP_SIZE 32
-#define PRIO_FULL_MASK 0xffffffff
+#ifdef USE_ROCM
+using PufWarpMask = unsigned long long;
+#else
+using PufWarpMask = unsigned int;
+#endif
 #define PRIO_BLOCK_SIZE 256
 #define PRIO_NUM_WARPS (PRIO_BLOCK_SIZE / PRIO_WARP_SIZE)
+
+__device__ __forceinline__ float prio_warp_sum(float val, int width = PRIO_WARP_SIZE) {
+    PufWarpMask mask = (PufWarpMask)__activemask();
+    for (int s = width / 2; s >= 1; s /= 2) {
+        val += __shfl_down_sync(mask, val, s, width);
+    }
+    return val;
+}
+
 __global__ void compute_prio_adv_reduction(
         const precision_t* __restrict__ advantages,
         float* prio_weights, float prio_alpha, int stride) {
@@ -950,9 +1063,7 @@ __global__ void compute_prio_adv_reduction(
         local_sum += fabsf(to_float(advantages[offset + t]));
     }
 
-    for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) {
-        local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s);
-    }
+    local_sum = prio_warp_sum(local_sum);
     if (tx == 0) {
         float pw = __powf(local_sum, prio_alpha);
         if (isnan(pw) || isinf(pw)) {
@@ -975,9 +1086,7 @@ __global__ void compute_prio_normalize(float* prio_weights, int length) {
     for (int t = tx; t < length; t += blockDim.x) {
         local_sum += prio_weights[t];
     }
-    for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) {
-        local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s);
-    }
+    local_sum = prio_warp_sum(local_sum);
     if (lane == 0) {
         shmem[warp_id] = local_sum;
     }
@@ -985,9 +1094,7 @@ __global__ void compute_prio_normalize(float* prio_weights, int length) {
 
     if (warp_id == 0) {
         float val = (lane < PRIO_NUM_WARPS) ? shmem[lane] : 0.0f;
-        for (int s = PRIO_NUM_WARPS / 2; s >= 1; s /= 2) {
-            val += __shfl_down_sync(PRIO_FULL_MASK, val, s);
-        }
+        val = prio_warp_sum(val, PRIO_NUM_WARPS);
         if (tx == 0) {
             block_sum = val + eps;
         }
@@ -1413,7 +1520,7 @@ void train_impl(PuffeRL& pufferl) {
                 cudaGraph_t _graph;
                 assert(cudaStreamEndCapture(train_stream, &_graph) == cudaSuccess
                         && "cudaStreamEndCapture failed");
-                assert(cudaGraphInstantiate(&pufferl.train_cudagraph, _graph, 0) == cudaSuccess
+                assert(puf_graph_instantiate(&pufferl.train_cudagraph, _graph) == cudaSuccess
                         && "cudaGraphInstantiate failed");
                 assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed");
                 cudaDeviceSynchronize();
@@ -1517,8 +1624,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
         cudaEventCreate(&pufferl->profile.events[i]);
     }
     memset(pufferl->profile.accum, 0, sizeof(pufferl->profile.accum));
-    nvmlInit();
-    nvmlDeviceGetHandleByIndex(hypers.gpu_id, &pufferl->nvml_device);
+    gpu_monitor_init(hypers.gpu_id, &pufferl->gpu_device);
 
     // Create policy
     int input_size = pufferl->env.obs.shape[1];
@@ -1751,10 +1857,8 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
         net_callback_wrapper, thread_init_wrapper);
     static_vec_reset(vec);
 
-    if (hypers.profile) {
-        cudaDeviceSynchronize();
-        cudaProfilerStart();
-    }
+    if (hypers.profile) cudaDeviceSynchronize();
+    gpu_profiler_start(hypers.profile);
 
     double now = wall_clock();
     pufferl->start_time = now;
@@ -1766,9 +1870,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
 
 void close_impl(PuffeRL& pufferl) {
     cudaDeviceSynchronize();
-    if (pufferl.hypers.profile) {
-        cudaProfilerStop();
-    }
+    gpu_profiler_stop(pufferl.hypers.profile);
 
     cudaGraphExecDestroy(pufferl.train_cudagraph);
     for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) {
@@ -1800,7 +1902,7 @@ void close_impl(PuffeRL& pufferl) {
     for (int i = 0; i < NUM_TRAIN_EVENTS; i++) {
         cudaEventDestroy(pufferl.profile.events[i]);
     }
-    nvmlShutdown();
+    gpu_monitor_shutdown();
 
     static_vec_close(pufferl.vec);
 
diff --git a/src/rocm_cuda_shim.cpp b/src/rocm_cuda_shim.cpp
new file mode 100644
index 0000000000..0feed14b2e
--- /dev/null
+++ b/src/rocm_cuda_shim.cpp
@@ -0,0 +1,57 @@
+#include <hip/hip_runtime.h>
+
+extern "C" {
+
+hipError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) {
+    return hipHostMalloc(ptr, size, flags);
+}
+
+hipError_t cudaMalloc(void** ptr, size_t size) {
+    return hipMalloc(ptr, size);
+}
+
+hipError_t cudaMemcpy(void* dst, const void* src, size_t size, int kind) {
+    return hipMemcpy(dst, src, size, (hipMemcpyKind)kind);
+}
+
+hipError_t cudaMemcpyAsync(void* dst, const void* src, size_t size, int kind, hipStream_t stream) {
+    return hipMemcpyAsync(dst, src, size, (hipMemcpyKind)kind, stream);
+}
+
+hipError_t cudaMemset(void* dst, int value, size_t size) {
+    return hipMemset(dst, value, size);
+}
+
+hipError_t cudaFree(void* ptr) {
+    return hipFree(ptr);
+}
+
+hipError_t cudaFreeHost(void* ptr) {
+    return hipHostFree(ptr);
+}
+
+hipError_t cudaSetDevice(int device) {
+    return hipSetDevice(device);
+}
+
+hipError_t cudaDeviceSynchronize(void) {
+    return hipDeviceSynchronize();
+}
+
+hipError_t cudaStreamSynchronize(hipStream_t stream) {
+    return hipStreamSynchronize(stream);
+}
+
+hipError_t cudaStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
+    return hipStreamCreateWithFlags(stream, flags);
+}
+
+hipError_t cudaStreamQuery(hipStream_t stream) {
+    return hipStreamQuery(stream);
+}
+
+const char* cudaGetErrorString(hipError_t error) {
+    return hipGetErrorString(error);
+}
+
+}