Merged

Changes from 1 commit
git rebase on top of master: fixing the correctness of the mat mul operations, updating layout mappings for RDNA4
jiachengjason committed Nov 23, 2025
commit 48afe04ca374790ae4ec8cac75bfc195f6d39cb9
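
For context on the "updating layout mappings for RDNA4" half of this commit: the tile specialization moved below maps a 16x16 int tile across a wave32 as get_i(l) = 8*(lane/16) + l and get_j(l) = lane % 16, i.e. each lane owns 8 rows of a single column, with the two half-waves splitting the rows. A small host-side checker (hypothetical, not part of the patch) that the mapping covers the tile exactly once:

    #include <cassert>

    int main() {
        bool seen[16][16] = {};
        for (int lane = 0; lane < 32; ++lane) {     // wave32
            for (int l = 0; l < 8; ++l) {           // ne = 16*16/32 = 8 elements per lane
                const int i = 8 * (lane / 16) + l;  // get_i: lanes 0..15 own rows 0..7, lanes 16..31 own rows 8..15
                const int j = lane % 16;            // get_j: one column per lane in each half-wave
                assert(!seen[i][j]);
                seen[i][j] = true;                  // every (i, j) is owned exactly once
            }
        }
        return 0;
    }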
Empty file modified build-xcframework.sh
100755 → 100644
Collaborator

Why are you changing this?

Contributor Author

It was a mistake; it is reverted now, thanks!

Collaborator

There are still changes to this file.

Empty file.
99 changes: 63 additions & 36 deletions ggml/src/ggml-cuda/mma.cuh
@@ -73,34 +73,7 @@ namespace ggml_cuda_mma {
static constexpr int I = I_;
static constexpr int J = J_;

#if defined(GGML_USE_HIP)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
T x[ne] = {0};

static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
return false;
}

static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
return 8 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}

static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
#if defined(AMD_MFMA_AVAILABLE)
static constexpr int ne = I * J / 64;
T x[ne] = {0};

@@ -146,7 +119,6 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -177,6 +149,34 @@ namespace ggml_cuda_mma {
return -1;
}
}
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
T x[ne] = {0};

static constexpr __device__ bool supported() {
if (I == 16 && J == 16) return true;
return false;
}

static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
return 8 * (threadIdx.x / 16) + l;
} else {
NO_DEVICE_CODE;
return -1;
}
}

static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
return threadIdx.x % 16;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#endif // defined(RDNA4)
#else
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -425,7 +425,7 @@ namespace ggml_cuda_mma {

template <int I, int J, typename T>
static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
@@ -784,21 +784,21 @@ namespace ggml_cuda_mma {
#if defined(RDNA4)

acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
false,
true,
a_vec[0],
false,
true,
b_vec[0],
acc[0],
false
true
);

acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
false,
true,
a_vec[1],
false,
true,
b_vec[1],
acc[0],
false
true
);
#endif // defined(RDNA4)
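
A note on the flag flips above, which are the "correctness of the mat mul operations" half of the commit: as I read the gfx12 WMMA builtin (an assumption worth double-checking against the compiler headers), its boolean arguments are signedness and clamp flags:

    acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
        true,      // A holds signed int8 (false = unsigned, wrong for signed Q8 data)
        a_vec[0],  // A fragment: 2x i32 per lane = 8 packed int8 values
        true,      // B holds signed int8
        b_vec[0],  // B fragment: 2x i32 per lane = 8 packed int8 values
        acc[0],    // accumulator: 8x i32 per lane
        true       // clamp: saturate the i32 accumulation instead of wrapping
    );

With the flags at false, negative quantized values were treated as large unsigned bytes, corrupting the products.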

@@ -873,4 +873,31 @@ namespace ggml_cuda_mma {
mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;

using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;

acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
#else
GGML_UNUSED(D);
GGML_UNUSED(A);
GGML_UNUSED(B);
NO_DEVICE_CODE;
#endif // AMD_WMMA_AVAILABLE
}
}
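
A sanity sketch (assumptions mine, not part of the patch) of the shape arithmetic behind this new overload: a tile<16, 4, int> row packs 4 i32 = 16 int8 values, exactly the K = 16 of one wmma_i32_16x16x16_iu8 call, so a single intrinsic consumes the whole A/B pair, and the per-lane fragment sizes match the vector casts in mma():

    static_assert(4 * sizeof(int) == 16, "4 packed i32 per row = 16 int8 = WMMA K");
    static_assert(16 * 4 / 32 == 2,      "tile<16, 4, int>::ne on wave32 -> int32x2_t per lane");
    static_assert(16 * 16 / 32 == 8,     "tile<16, 16, int>::ne on wave32 -> int32x8_t per lane");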

176 changes: 170 additions & 6 deletions ggml/src/ggml-cuda/mmq.cuh
@@ -265,7 +265,7 @@ static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
#endif // (GGML_USE_HIP)

static constexpr __device__ int mmq_get_nwarps_device() {
#if defined(AMD_MFMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
return 8;
#else
return 256/ggml_cuda_get_physical_warp_size();
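
(Worked out: on wave32 the fallback branch is 256/32 = 8 anyway, so adding AMD_WMMA_AVAILABLE here keeps nwarps at 8 on RDNA4 while making the value the MMA paths assume explicit; the branch only changes the result on wave64 hardware, where 256/64 = 4.)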
@@ -1129,7 +1129,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
@@ -1170,6 +1170,54 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
tile_C C;
mma(C, A[n], B[0]);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;

constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;

const int i0 = (threadIdx.y / ntx) * rows_per_warp;

for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;

tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
}

#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];

#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
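
A scalar model of what each contribution in the loop above computes (my sketch, assuming Q8-style blocks where x = d_x*q_x and y = d_y*q_y with signed int8 q):

    #include <cstdint>

    static float q8_dot_block(float d_x, const int8_t * q_x,
                              float d_y, const int8_t * q_y, int n) {
        int32_t c = 0;              // plays the role of C.x[l]
        for (int k = 0; k < n; ++k) {
            c += q_x[k] * q_y[k];   // the integer dot product done by the WMMA builtin
        }
        return c * d_x * d_y;       // matches C.x[l] * x_df[i*...] * dB above
    }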
@@ -1386,7 +1434,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
@@ -1438,6 +1486,72 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
tile_C Cd;
mma(Cd, A[n], B[0]);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
float tmp = Cd.x[l]*dm.x;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tmp -= Cm.x[l]*dm.y;
}
sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)

typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;

constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

const int * x_qs = (const int *) x;
const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;

const int i0 = (threadIdx.y / ntx) * rows_per_warp;

for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;

tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
}

#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

const int j = j0 + tile_C::get_j(0);
const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
: (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
: __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);

tile_C Cm;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tile_A A1;
A1.x[0] = 0x01010101;
A1.x[1] = 0x01010101;
mma(Cm, A1, B);
}

#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C Cd;
mma(Cd, A[n], B);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
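
A note on the Cm trick in this hunk (my model; assuming a q2_K subblock dequantizes roughly as x = dm.x*q - dm.y): dot(x, y) = dm.x*dot(q, y) - dm.y*sum(y). The constant A1 tile filled with 0x01010101 (four packed int8 ones per i32) makes the extra WMMA call compute exactly sum(y); for the lower k01 ranges the same column sum is instead read precomputed from y_ds as the sB term. In scalar form:

    #include <cstdint>

    static float q2k_dot(float d, float m, const int8_t * q,
                         const int8_t * y, float dB, int n) {
        int32_t cd = 0, cm = 0;
        for (int k = 0; k < n; ++k) {
            cd += q[k] * y[k];          // Cd = mma(Cd, A, B)
            cm += 1 * y[k];             // Cm = mma(Cm, A1, B), A1 packed with int8 ones
        }
        return (cd * d - cm * m) * dB;  // tmp = Cd*dm.x - Cm*dm.y, then scaled by dB
    }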
@@ -1662,7 +1776,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}

#if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)) || defined(AMD_WMMA_AVAILABLE)
#if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
@@ -1921,7 +2035,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
constexpr int rows_per_warp = warp_size / 2;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
// Need if on AMD instead of % because warp_size == 64
// This causes double work and throughput loss (MI300X)
// H100 loses about 100 t/s with 'if' condition over '%'
@@ -2148,7 +2262,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
@@ -2190,6 +2304,56 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
tile_C C;
mma(C, A[n], B[0]);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
}
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
typedef tile<16, 4, int> tile_A;
typedef tile<16, 4, int> tile_B;
typedef tile<16, 16, int> tile_C;

constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = granularity;
constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

const int * x_qs = (const int *) x;
const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;

const int i0 = (threadIdx.y / ntx) * rows_per_warp;

for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
const int k0 = k00 + k01;

tile_A A[ntx];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
}

#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
tile_B B;
load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

const int j = j0 + tile_C::get_j(0);
const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];

#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma(C, A[n], B);

#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = i0 + n*tile_C::I + tile_C::get_i(l);
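
A scalar model of the q6_K accumulation above (my sketch, assuming x = d*sc*q with one int8 scale per 16 values): each k01 step of 4 packed i32 covers 16 int8 values, exactly one scale group, so sc[k01/4] advances in lock-step with the WMMA K-slices. With k counting int8 elements instead of packed i32:

    #include <cstdint>

    static float q6k_dot(float d, const int8_t * sc, const int8_t * q,
                         const int8_t * y, float dB, int n) {
        float sum = 0.0f;
        for (int k0 = 0; k0 < n; k0 += 16) {    // one WMMA K-slice = 16 values
            int32_t c = 0;
            for (int k = 0; k < 16; ++k) {
                c += q[k0 + k] * y[k0 + k];     // mma(C, A[n], B)
            }
            sum += c * sc[k0 / 16] * d * dB;    // C.x[l] * sc[k01/4] * x_df[...] * dB
        }
        return sum;
    }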