PR clean up, addressed comments
jiachengjason committed Nov 23, 2025
commit 1dc62a795ce1a165636e50f32a0c23d64dca9fe0
6 changes: 1 addition & 5 deletions ggml/src/ggml-cuda/common.cuh
@@ -232,7 +232,7 @@ static const char * cu_get_error_str(CUresult err) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-#if defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
+#if defined(GGML_USE_HIP) && defined(RDNA4)
#define AMD_WMMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)

@@ -299,11 +299,7 @@ static bool volta_mma_available(const int cc) {
}

static bool amd_wmma_available(const int cc) {
-#if !defined(GGML_HIP_NO_MMQ_WMMA)
    return GGML_CUDA_CC_IS_RDNA4(cc);
-#else
-    return false;
-#endif //!defined(GGML_HIP_NO_MMQ_WMMA)
}

// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
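Taken together, the two hunks above leave the RDNA4 WMMA gate in common.cuh reading roughly as follows (a sketch reassembled from the diff, not a verbatim copy of the file; GGML_CUDA_CC_IS_RDNA4 and the RDNA4 macro come from the surrounding header):

// Compile-time gate: no longer conditional on GGML_HIP_NO_MMQ_WMMA.
#if defined(GGML_USE_HIP) && defined(RDNA4)
#define AMD_WMMA_AVAILABLE
#endif

// Runtime check: reduced to a single compute-capability test.
static bool amd_wmma_available(const int cc) {
    return GGML_CUDA_CC_IS_RDNA4(cc);
}
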
4 changes: 2 additions & 2 deletions ggml/src/ggml-cuda/mmq.cu
@@ -290,11 +290,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

-    if (amd_mfma_available(cc)||amd_wmma_available(cc)) {
+    if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
// As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
// performs better but is currently suffering from a crash on this architecture.
// TODO: Revisit when hipblaslt is fixed on CDNA3
-        if (GGML_CUDA_CC_IS_CDNA3(cc)||GGML_CUDA_CC_IS_RDNA4(cc)) {
+        if (GGML_CUDA_CC_IS_CDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
return true;
}
if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
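Assembled from the hunk above, the AMD branch of ggml_cuda_should_use_mmq reads roughly like this after the style fix (a sketch; the rest of the function and the DP4A fallback are elided):

    if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
        // As of ROCm 7.0, rocblas/tensile performs very poorly on CDNA3, and hipblaslt
        // (via ROCBLAS_USE_HIPBLASLT) performs better but currently crashes on that
        // architecture, so MMQ is used unconditionally on CDNA3 and RDNA4.
        if (GGML_CUDA_CC_IS_CDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
            return true;
        }
        // Small batch sizes and the simpler quantization formats also stay on MMQ.
        if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 ||
            type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        // ... remaining cases are unchanged by this commit.
    }
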
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/mmq.cuh
@@ -2262,7 +2262,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
-#if defined(AMD_MFMA_AVAILABLE)
+    #if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;
4 changes: 0 additions & 4 deletions ggml/src/ggml-hip/CMakeLists.txt
Contributor: I think GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 is not used anymore

Collaborator: it's not

@@ -120,10 +120,6 @@ if (NOT GGML_HIP_MMQ_WMMA)
add_compile_definitions(GGML_HIP_NO_MMQ_WMMA)
endif()

-if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
-    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
-endif()

if (GGML_HIP_EXPORT_METRICS)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()