Skip to content

Commit ce7054a

Browse files
committed
Skip these distributed tests on ROCm; they still have issues on MI350
1 parent db0e318 commit ce7054a

2 files changed

Lines changed: 10 additions & 0 deletions

File tree

test/test_distributed.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from helion._testing import TestCase
2424
from helion._testing import import_path
2525
from helion._testing import onlyBackends
26+
from helion._testing import skipIfRocm
2627
from helion._testing import skipIfXPU
2728
from helion.autotuner import search_algorithms
2829
from helion.autotuner.effort_profile import _PROFILES
@@ -162,6 +163,7 @@ def _cleanup_process(self):
162163
dist.barrier()
163164
dist.destroy_process_group()
164165

166+
@skipIfRocm("Distributed example requires CUDA/NCCL")
165167
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
166168
@skip_if_lt_x_gpu(4)
167169
def test_sync_seed(self):
@@ -189,6 +191,7 @@ def _all_eq(xlist: list[Tensor]) -> bool:
189191

190192
self._cleanup_process()
191193

194+
@skipIfRocm("Distributed example requires CUDA/NCCL")
192195
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
193196
@skip_if_lt_x_gpu(4)
194197
@parametrize("autotuner", autotuner_names)
@@ -252,6 +255,7 @@ def do_test_allreduce(self, kernel):
252255

253256
torch.testing.assert_close(result, expected, rtol=1e-1, atol=1e-1)
254257

258+
@skipIfRocm("Distributed example requires CUDA/NCCL")
255259
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
256260
@skip_if_lt_x_gpu(4)
257261
@parametrize(
@@ -332,6 +336,7 @@ def do_test_allreduce_bias_rmsnorm(self, kernel, ref_kernel):
332336
expected = ref_kernel(x, bias, weight)
333337
torch.testing.assert_close(result, expected, rtol=1e-4, atol=1e-4)
334338

339+
@skipIfRocm("Distributed example requires CUDA/NCCL")
335340
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
336341
@skip_if_lt_x_gpu(4)
337342
@parametrize("autotuner", autotuner_names)

test/test_examples_dist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from helion._testing import code_and_output
2121
from helion._testing import import_path
2222
from helion._testing import onlyBackends
23+
from helion._testing import skipIfRocm
2324
from helion._testing import skipIfXPU
2425

2526

@@ -83,6 +84,7 @@ def _cleanup_process(self):
8384
dist.barrier()
8485
dist.destroy_process_group()
8586

87+
@skipIfRocm("Distributed example requires CUDA/NCCL")
8688
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
8789
@skip_if_lt_x_gpu(4)
8890
def test_all_gather_matmul(self):
@@ -137,6 +139,7 @@ def test_all_gather_matmul(self):
137139
torch.cuda.current_stream().wait_stream(backend_stream)
138140
self._cleanup_process()
139141

142+
@skipIfRocm("Distributed example requires CUDA/NCCL")
140143
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
141144
@skip_if_lt_x_gpu(4)
142145
def test_all_reduce(self):
@@ -182,6 +185,7 @@ def test_all_reduce(self):
182185

183186
self._cleanup_process()
184187

188+
@skipIfRocm("Distributed example requires CUDA/NCCL")
185189
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
186190
@skip_if_lt_x_gpu(4)
187191
@parametrize(
@@ -239,6 +243,7 @@ def test_allreduce_bias_rmsnorm(self, kernel_name):
239243

240244
self._cleanup_process()
241245

246+
@skipIfRocm("Distributed example requires CUDA/NCCL")
242247
@skipIfXPU("Distributed operations require CCL, not yet fully integrated")
243248
@skip_if_lt_x_gpu(4)
244249
def test_matmul_reduce_scatter(self):

0 commit comments

Comments
 (0)