Skip to content

Commit 9075f54

Browse files
committed
clean up after rebase: remove duplicated change, revert cmake files
1 parent fa56838 commit 9075f54

File tree

3 files changed: +14 -9 lines changed

ggml/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,6 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
210 210
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
211 211
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
212 212
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
213 -
option(GGML_HIP_MMQ_WMMA "ggml: enable WMMA MMA for RDNA4 in MMQ" ON)
214 213
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
215 214
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
216 215
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)

ggml/src/ggml-cuda/common.cuh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,6 @@ static const char * cu_get_error_str(CUresult err) {
232 232
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
233 233
#define VOLTA_MMA_AVAILABLE
234 234
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
235 -
#if defined(GGML_USE_HIP) && defined(RDNA4)
236 -
#define AMD_WMMA_AVAILABLE
237 -
#endif // defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
238 235

239 236
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
240 237
#define TURING_MMA_AVAILABLE
@@ -298,10 +295,6 @@ static bool volta_mma_available(const int cc) {
298 295
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
299 296
}
300 297

301 -
static bool amd_wmma_available(const int cc) {
302 -
return GGML_CUDA_CC_IS_RDNA4(cc);
303 -
}
304 -
305 298
static bool turing_mma_available(const int cc) {
306 299
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
307 300
}

ggml/src/ggml-cuda/mma.cuh

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,20 @@ namespace ggml_cuda_mma {
437 437
xi[0] = xs[0];
438 438
}
439 439
#elif defined(AMD_WMMA_AVAILABLE)
440 -
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
440 +
if constexpr (I == 16 && J == 4) {
441 +
int64_t * xi = (int64_t *) t.x;
442 +
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
443 +
xi[0] = xs[0];
444 +
}else if constexpr (I == 16 && J == 8) {
445 +
int64_t * xi = (int64_t *) t.x;
446 +
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
447 +
xi[0] = xs[0];
448 +
449 +
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
450 +
xi[1] = xs1[0];
451 +
}else{
452 +
NO_DEVICE_CODE;
453 +
}
441 454
#else
442 455
#pragma unroll
443 456
for (int l = 0; l < t.ne; ++l) {

0 commit comments

Comments
 (0)