@@ -19,39 +19,58 @@ def _get_num_gpus() -> int:
1919 return torch .cuda .device_count ()
2020
2121
22- @pytest .mark .skipif (_get_num_gpus () < 2 , reason = "Requires at least 2 GPUs" )
23- class TestMultiGPUDeviceAssignment :
24- """Tests for multi-GPU device assignment in distributed training."""
class TestResolveCudaDevice:
    """Tests for resolve_cuda_device across different GPU visibility scenarios.

    These are pure unit tests: ``torch.cuda.device_count`` is mocked, so no
    physical GPU is required. Each test imports ``resolve_cuda_device`` lazily
    inside the method so collection does not fail when the package is absent.
    """

    def test_full_visibility(self):
        """When all GPUs are visible, each rank gets its own device."""
        from isaaclab.utils.distributed import resolve_cuda_device

        # Simulate 4 visible GPUs: rank i should map 1:1 onto cuda:i.
        with unittest.mock.patch("torch.cuda.device_count", return_value=4):
            for local_rank in range(4):
                device, device_id = resolve_cuda_device(local_rank)
                assert device == f"cuda:{local_rank}"
                assert device_id == local_rank

    def test_restricted_visibility(self):
        """When CUDA_VISIBLE_DEVICES restricts each process to 1 GPU, all ranks use cuda:0."""
        from isaaclab.utils.distributed import resolve_cuda_device

        # With only one device visible per process, every rank's device index
        # collapses to 0 regardless of its local rank.
        with unittest.mock.patch("torch.cuda.device_count", return_value=1):
            for local_rank in range(4):
                device, device_id = resolve_cuda_device(local_rank)
                assert device == "cuda:0"
                assert device_id == 0

    def test_multi_node(self):
        """Multi-node: 4 visible GPUs per node, world_size=8, each rank gets its own GPU."""
        from isaaclab.utils.distributed import resolve_cuda_device

        # Local ranks are per-node (0..3), so the mapping is identical to the
        # single-node full-visibility case on each node.
        with unittest.mock.patch("torch.cuda.device_count", return_value=4):
            for local_rank in range(4):
                device, device_id = resolve_cuda_device(local_rank)
                assert device == f"cuda:{local_rank}", (
                    f"Multi-node: local_rank={local_rank} should map to cuda:{local_rank}"
                )

    def test_local_rank_exceeds_visible(self):
        """When local_rank >= num_visible, fall back to cuda:0."""
        from isaaclab.utils.distributed import resolve_cuda_device

        # Rank 5 with only 2 visible devices must not produce cuda:5.
        with unittest.mock.patch("torch.cuda.device_count", return_value=2):
            device, device_id = resolve_cuda_device(5)
            assert device == "cuda:0"
            assert device_id == 0
65+
66+ @pytest .mark .skipif (_get_num_gpus () < 2 , reason = "Requires at least 2 GPUs" )
67+ class TestMultiGPUDeviceAssignment :
68+ """Integration tests for multi-GPU training."""
69+
5070 def test_cartpole_newton_multigpu (self ):
5171 """Test that multi-GPU cartpole training with Newton physics runs without error."""
5272 num_gpus = min (_get_num_gpus (), 4 )
5373
54- # Run a quick 2-iteration training to verify setup works
5574 cmd = [
5675 "torchrun" ,
5776 f"--nproc_per_node={ num_gpus } " ,
@@ -65,12 +84,9 @@ def test_cartpole_newton_multigpu(self):
6584 ]
6685
6786 env = os .environ .copy ()
68- # Required for containers with IOMMU where P2P transport fails,
69- # and for environments without InfiniBand.
7087 env ["NCCL_P2P_DISABLE" ] = "1"
7188 env ["NCCL_IB_DISABLE" ] = "1"
7289
73- # Resolve IsaacLab root from test file location
7490 test_dir = os .path .dirname (os .path .abspath (__file__ ))
7591 isaaclab_root = os .path .dirname (os .path .dirname (os .path .dirname (test_dir )))
7692
@@ -83,7 +99,6 @@ def test_cartpole_newton_multigpu(self):
8399 cwd = isaaclab_root ,
84100 )
85101
86- # Verify training actually ran (not just a clean exit from early skip)
87102 has_training_output = "Learning iteration 1" in result .stdout
88103 no_cuda_errors = "CUDA error" not in result .stderr
89104 assert result .returncode == 0 and has_training_output and no_cuda_errors , (
@@ -97,42 +112,25 @@ def test_cartpole_newton_multigpu(self):
class TestMultiGPUCameraRendering:
    """Tests for multi-GPU training with camera observations.

    These tests only inspect the ``MultiBackendRendererCfg`` configuration
    object; they do not launch a simulator, so they run without GPUs.
    """

    def test_preset_renderer_has_newton_renderer_field(self):
        """Test that MultiBackendRendererCfg has newton_renderer field."""
        from isaaclab_tasks.utils.presets import MultiBackendRendererCfg

        cfg = MultiBackendRendererCfg()
        assert hasattr(cfg, "newton_renderer"), (
            "MultiBackendRendererCfg should have 'newton_renderer' field"
        )
        # Check by class-name substring to avoid importing the renderer type directly.
        assert "NewtonWarp" in type(cfg.newton_renderer).__name__, (
            f"newton_renderer should be NewtonWarpRendererCfg, got {type(cfg.newton_renderer).__name__}"
        )

    def test_alias_fields_are_independent_instances(self):
        """Test that aliased fields are separate instances (not shared mutable references)."""
        from isaaclab_tasks.utils.presets import MultiBackendRendererCfg

        cfg = MultiBackendRendererCfg()
        # Identity check: mutating one field must not silently mutate the other.
        assert cfg.isaacsim_rtx_renderer is not cfg.default, (
            "isaacsim_rtx_renderer and default should be independent instances"
        )
138136
0 commit comments