These tests still have issues on MI350.

umechand-amd · umechand-amd · commit ce7054a0b9f5 · 2026-03-27T04:25:31.000Z
diff --git a/test/test_distributed.py b/test/test_distributed.py
@@ -23,6 +23,7 @@
 from helion._testing import TestCase
 from helion._testing import import_path
 from helion._testing import onlyBackends
+from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
 from helion.autotuner import search_algorithms
 from helion.autotuner.effort_profile import _PROFILES
@@ -162,6 +163,7 @@ def _cleanup_process(self):
         dist.barrier()
         dist.destroy_process_group()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     def test_sync_seed(self):
@@ -189,6 +191,7 @@ def _all_eq(xlist: list[Tensor]) -> bool:
 
         self._cleanup_process()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     @parametrize("autotuner", autotuner_names)
@@ -252,6 +255,7 @@ def do_test_allreduce(self, kernel):
 
         torch.testing.assert_close(result, expected, rtol=1e-1, atol=1e-1)
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     @parametrize(
@@ -332,6 +336,7 @@ def do_test_allreduce_bias_rmsnorm(self, kernel, ref_kernel):
         expected = ref_kernel(x, bias, weight)
         torch.testing.assert_close(result, expected, rtol=1e-4, atol=1e-4)
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     @parametrize("autotuner", autotuner_names)
diff --git a/test/test_examples_dist.py b/test/test_examples_dist.py
@@ -20,6 +20,7 @@
 from helion._testing import code_and_output
 from helion._testing import import_path
 from helion._testing import onlyBackends
+from helion._testing import skipIfRocm
 from helion._testing import skipIfXPU
 
 
@@ -83,6 +84,7 @@ def _cleanup_process(self):
         dist.barrier()
         dist.destroy_process_group()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     def test_all_gather_matmul(self):
@@ -137,6 +139,7 @@ def test_all_gather_matmul(self):
         torch.cuda.current_stream().wait_stream(backend_stream)
         self._cleanup_process()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     def test_all_reduce(self):
@@ -182,6 +185,7 @@ def test_all_reduce(self):
 
         self._cleanup_process()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     @parametrize(
@@ -239,6 +243,7 @@ def test_allreduce_bias_rmsnorm(self, kernel_name):
 
         self._cleanup_process()
 
+    @skipIfRocm("Distributed example requires CUDA/NCCL")
     @skipIfXPU("Distributed operations require CCL, not yet fully integrated")
     @skip_if_lt_x_gpu(4)
     def test_matmul_reduce_scatter(self):