|
23 | 23 | from helion._testing import TestCase |
24 | 24 | from helion._testing import import_path |
25 | 25 | from helion._testing import onlyBackends |
| 26 | +from helion._testing import skipIfRocm |
26 | 27 | from helion._testing import skipIfXPU |
27 | 28 | from helion.autotuner import search_algorithms |
28 | 29 | from helion.autotuner.effort_profile import _PROFILES |
@@ -162,6 +163,7 @@ def _cleanup_process(self): |
162 | 163 | dist.barrier() |
163 | 164 | dist.destroy_process_group() |
164 | 165 |
|
| 166 | + @skipIfRocm("Distributed example requires CUDA/NCCL") |
165 | 167 | @skipIfXPU("Distributed operations require CCL, not yet fully integrated") |
166 | 168 | @skip_if_lt_x_gpu(4) |
167 | 169 | def test_sync_seed(self): |
@@ -189,6 +191,7 @@ def _all_eq(xlist: list[Tensor]) -> bool: |
189 | 191 |
|
190 | 192 | self._cleanup_process() |
191 | 193 |
|
| 194 | + @skipIfRocm("Distributed example requires CUDA/NCCL") |
192 | 195 | @skipIfXPU("Distributed operations require CCL, not yet fully integrated") |
193 | 196 | @skip_if_lt_x_gpu(4) |
194 | 197 | @parametrize("autotuner", autotuner_names) |
@@ -252,6 +255,7 @@ def do_test_allreduce(self, kernel): |
252 | 255 |
|
253 | 256 | torch.testing.assert_close(result, expected, rtol=1e-1, atol=1e-1) |
254 | 257 |
|
| 258 | + @skipIfRocm("Distributed example requires CUDA/NCCL") |
255 | 259 | @skipIfXPU("Distributed operations require CCL, not yet fully integrated") |
256 | 260 | @skip_if_lt_x_gpu(4) |
257 | 261 | @parametrize( |
@@ -332,6 +336,7 @@ def do_test_allreduce_bias_rmsnorm(self, kernel, ref_kernel): |
332 | 336 | expected = ref_kernel(x, bias, weight) |
333 | 337 | torch.testing.assert_close(result, expected, rtol=1e-4, atol=1e-4) |
334 | 338 |
|
| 339 | + @skipIfRocm("Distributed example requires CUDA/NCCL") |
335 | 340 | @skipIfXPU("Distributed operations require CCL, not yet fully integrated") |
336 | 341 | @skip_if_lt_x_gpu(4) |
337 | 342 | @parametrize("autotuner", autotuner_names) |
|
0 commit comments