Skip to content

Commit 4f1da95

Browse files
author
ClawLabby
committed
fix: address review feedback on multi-GPU PR
- Fix resolve_cuda_device() multi-node bug: compare local_rank against num_visible (not WORLD_SIZE) so 2-node x 4-GPU setups work correctly. Remove unused os import and WORLD_SIZE env var. - Revert presets.py to upstream: remove newton and physx fields from MultiBackendRendererCfg. The global 'newton' preset resolves physics, not renderer — renderer resolves via 'newton_renderer' field. Fix isaacsim_rtx_renderer to use proper type annotation (independent instance, not shared alias). - Revert _is_kit_camera() PresetCfg escape hatch: PresetCfg renderers are always resolved by Hydra before launch_simulation() is called, so the check is unnecessary. Restore original upstream logic. - Rewrite tests: test resolve_cuda_device() by calling the actual function with mocked torch.cuda.device_count(). Add multi-node test case. Remove vacuous inline-logic test and redundant skipif.
1 parent f3005fd commit 4f1da95

4 files changed

Lines changed: 45 additions & 57 deletions

File tree

source/isaaclab/isaaclab/utils/distributed.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77

88
from __future__ import annotations
99

10-
import os
11-
1210
import torch
1311

1412

@@ -27,9 +25,10 @@ def resolve_cuda_device(local_rank: int) -> tuple[str, int]:
2725
``("cuda:2", 2)``.
2826
"""
2927
num_visible = torch.cuda.device_count()
30-
world_size = int(os.getenv("WORLD_SIZE", "1"))
31-
if num_visible >= world_size:
28+
if local_rank < num_visible:
3229
device_id = local_rank
3330
else:
31+
# CUDA_VISIBLE_DEVICES restricts this process to fewer GPUs than
32+
# local_rank implies — fall back to cuda:0.
3433
device_id = 0
3534
return f"cuda:{device_id}", device_id

source/isaaclab/test/test_multigpu_newton.py

Lines changed: 42 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -19,39 +19,58 @@ def _get_num_gpus() -> int:
1919
return torch.cuda.device_count()
2020

2121

22-
@pytest.mark.skipif(_get_num_gpus() < 2, reason="Requires at least 2 GPUs")
23-
class TestMultiGPUDeviceAssignment:
24-
"""Tests for multi-GPU device assignment in distributed training."""
22+
class TestResolveCudaDevice:
23+
"""Tests for resolve_cuda_device across different GPU visibility scenarios."""
2524

26-
def test_resolve_cuda_device_full_visibility(self):
25+
def test_full_visibility(self):
2726
"""When all GPUs are visible, each rank gets its own device."""
2827
from isaaclab.utils.distributed import resolve_cuda_device
2928

30-
# Simulate 4 visible GPUs, world_size=4
31-
with unittest.mock.patch("torch.cuda.device_count", return_value=4), \
32-
unittest.mock.patch.dict(os.environ, {"WORLD_SIZE": "4"}):
29+
with unittest.mock.patch("torch.cuda.device_count", return_value=4):
3330
for local_rank in range(4):
3431
device, device_id = resolve_cuda_device(local_rank)
3532
assert device == f"cuda:{local_rank}"
3633
assert device_id == local_rank
3734

38-
def test_resolve_cuda_device_restricted_visibility(self):
35+
def test_restricted_visibility(self):
3936
"""When CUDA_VISIBLE_DEVICES restricts each process to 1 GPU, all ranks use cuda:0."""
4037
from isaaclab.utils.distributed import resolve_cuda_device
4138

42-
# Simulate 1 visible GPU (CUDA_VISIBLE_DEVICES restricted), world_size=4
43-
with unittest.mock.patch("torch.cuda.device_count", return_value=1), \
44-
unittest.mock.patch.dict(os.environ, {"WORLD_SIZE": "4"}):
39+
with unittest.mock.patch("torch.cuda.device_count", return_value=1):
4540
for local_rank in range(4):
4641
device, device_id = resolve_cuda_device(local_rank)
4742
assert device == "cuda:0"
4843
assert device_id == 0
4944

45+
def test_multi_node(self):
46+
"""Multi-node: 4 visible GPUs per node, world_size=8, each rank gets its own GPU."""
47+
from isaaclab.utils.distributed import resolve_cuda_device
48+
49+
with unittest.mock.patch("torch.cuda.device_count", return_value=4):
50+
for local_rank in range(4):
51+
device, device_id = resolve_cuda_device(local_rank)
52+
assert device == f"cuda:{local_rank}", (
53+
f"Multi-node: local_rank={local_rank} should map to cuda:{local_rank}"
54+
)
55+
56+
def test_local_rank_exceeds_visible(self):
57+
"""When local_rank >= num_visible, fall back to cuda:0."""
58+
from isaaclab.utils.distributed import resolve_cuda_device
59+
60+
with unittest.mock.patch("torch.cuda.device_count", return_value=2):
61+
device, device_id = resolve_cuda_device(5)
62+
assert device == "cuda:0"
63+
assert device_id == 0
64+
65+
66+
@pytest.mark.skipif(_get_num_gpus() < 2, reason="Requires at least 2 GPUs")
67+
class TestMultiGPUDeviceAssignment:
68+
"""Integration tests for multi-GPU training."""
69+
5070
def test_cartpole_newton_multigpu(self):
5171
"""Test that multi-GPU cartpole training with Newton physics runs without error."""
5272
num_gpus = min(_get_num_gpus(), 4)
5373

54-
# Run a quick 2-iteration training to verify setup works
5574
cmd = [
5675
"torchrun",
5776
f"--nproc_per_node={num_gpus}",
@@ -65,12 +84,9 @@ def test_cartpole_newton_multigpu(self):
6584
]
6685

6786
env = os.environ.copy()
68-
# Required for containers with IOMMU where P2P transport fails,
69-
# and for environments without InfiniBand.
7087
env["NCCL_P2P_DISABLE"] = "1"
7188
env["NCCL_IB_DISABLE"] = "1"
7289

73-
# Resolve IsaacLab root from test file location
7490
test_dir = os.path.dirname(os.path.abspath(__file__))
7591
isaaclab_root = os.path.dirname(os.path.dirname(os.path.dirname(test_dir)))
7692

@@ -83,7 +99,6 @@ def test_cartpole_newton_multigpu(self):
8399
cwd=isaaclab_root,
84100
)
85101

86-
# Verify training actually ran (not just a clean exit from early skip)
87102
has_training_output = "Learning iteration 1" in result.stdout
88103
no_cuda_errors = "CUDA error" not in result.stderr
89104
assert result.returncode == 0 and has_training_output and no_cuda_errors, (
@@ -97,42 +112,25 @@ def test_cartpole_newton_multigpu(self):
97112
class TestMultiGPUCameraRendering:
98113
"""Tests for multi-GPU training with camera observations."""
99114

100-
def test_preset_renderer_matching(self):
101-
"""Test that newton preset correctly matches the renderer."""
115+
def test_preset_renderer_has_newton_renderer_field(self):
116+
"""Test that MultiBackendRendererCfg has newton_renderer field."""
102117
from isaaclab_tasks.utils.presets import MultiBackendRendererCfg
103118

104119
cfg = MultiBackendRendererCfg()
105-
assert hasattr(cfg, "newton"), "MultiBackendRendererCfg should have 'newton' field"
106-
assert "NewtonWarp" in type(cfg.newton).__name__, (
107-
f"newton field should be NewtonWarpRendererCfg, got {type(cfg.newton).__name__}"
120+
assert hasattr(cfg, "newton_renderer"), (
121+
"MultiBackendRendererCfg should have 'newton_renderer' field"
108122
)
109-
110-
def test_physx_preset_is_independent_instance(self):
111-
"""Test that physx preset is a separate instance from default (not a shared alias)."""
112-
from isaaclab_tasks.utils.presets import MultiBackendRendererCfg
113-
114-
cfg = MultiBackendRendererCfg()
115-
assert cfg.physx is not cfg.default, (
116-
"physx and default should be independent instances, not shared aliases"
123+
assert "NewtonWarp" in type(cfg.newton_renderer).__name__, (
124+
f"newton_renderer should be NewtonWarpRendererCfg, got {type(cfg.newton_renderer).__name__}"
117125
)
118126

119-
def test_camera_not_kit_with_preset_renderer(self):
120-
"""Test that PresetCfg renderers are not detected as Kit cameras."""
121-
from isaaclab.sensors import TiledCameraCfg
127+
def test_alias_fields_are_independent_instances(self):
128+
"""Test that aliased fields are separate instances (not shared mutable references)."""
122129
from isaaclab_tasks.utils.presets import MultiBackendRendererCfg
123-
from isaaclab_tasks.utils.sim_launcher import _is_kit_camera
124-
125-
cam = TiledCameraCfg(
126-
prim_path="/test",
127-
data_types=["rgb"],
128-
width=64,
129-
height=64,
130-
renderer_cfg=MultiBackendRendererCfg(),
131-
)
132130

133-
# PresetCfg renderers resolve to match the physics backend, so not necessarily Kit
134-
assert not _is_kit_camera(cam), (
135-
"Camera with PresetCfg renderer should not be detected as Kit camera"
131+
cfg = MultiBackendRendererCfg()
132+
assert cfg.isaacsim_rtx_renderer is not cfg.default, (
133+
"isaacsim_rtx_renderer and default should be independent instances"
136134
)
137135

138136

source/isaaclab_tasks/isaaclab_tasks/utils/presets.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,6 @@
1515
@configclass
1616
class MultiBackendRendererCfg(PresetCfg):
1717
default: IsaacRtxRendererCfg = IsaacRtxRendererCfg()
18-
newton: NewtonWarpRendererCfg = NewtonWarpRendererCfg()
1918
newton_renderer: NewtonWarpRendererCfg = NewtonWarpRendererCfg()
2019
ovrtx_renderer: OVRTXRendererCfg = OVRTXRendererCfg()
2120
isaacsim_rtx_renderer: IsaacRtxRendererCfg = IsaacRtxRendererCfg()
22-
# PhysX uses RTX rendering by default in Isaac Lab.
23-
physx: IsaacRtxRendererCfg = IsaacRtxRendererCfg()

source/isaaclab_tasks/isaaclab_tasks/utils/sim_launcher.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,6 @@ def _is_kit_camera(node) -> bool:
122122
return True
123123
if isinstance(renderer_cfg, RendererCfg):
124124
return renderer_cfg.renderer_type in ("default", "isaac_rtx")
125-
# PresetCfg renderers (e.g. MultiBackendRendererCfg) are resolved later;
126-
# assume they will match the physics backend, so not necessarily Kit.
127-
from isaaclab_tasks.utils import PresetCfg
128-
129-
if isinstance(renderer_cfg, PresetCfg):
130-
return False
131125
# Unknown renderer type — conservatively assume Kit is required.
132126
return True
133127

0 commit comments

Comments (0)