Skip to content

Commit 3856668

Browse files
authored
ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring (ggml-ci)
* cuda : update supports_op for IQ2 (ggml-ci)
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes (ggml-ci)
* cuda : fix out-of-bounds access in `mul_mat_vec_q` (ggml-ci)
* tests : avoid creating RNGs for each Q tensor (ggml-ci)
* tests : avoid creating RNGs for each tensor (ggml-ci)
1 parent ba69bbc commit 3856668

9 files changed

+128
-87
lines changed

ci/run.sh

+8-4
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
3636
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
3737
fi
3838

39+
if [ ! -z ${GG_BUILD_CUDA} ]; then
40+
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
41+
fi
42+
3943
## helpers
4044

4145
# download a file if it does not exist or if it is outdated
@@ -160,8 +164,8 @@ function gg_run_open_llama_3b_v2 {
160164

161165
set -e
162166

163-
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
164-
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
167+
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
168+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
165169

166170
python3 ../convert.py ${path_models}
167171

@@ -343,8 +347,8 @@ function gg_run_open_llama_7b_v2 {
343347

344348
set -e
345349

346-
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
347-
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
350+
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
351+
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
348352

349353
python3 ../convert.py ${path_models}
350354

ggml-backend.c

+2
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
692692

693693
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
694694
switch (op->op) {
695+
case GGML_OP_CPY:
696+
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
695697
case GGML_OP_MUL_MAT:
696698
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
697699
default:

ggml-cuda.cu

+9-3
Original file line numberDiff line numberDiff line change
@@ -5131,10 +5131,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
51315131
const block_q_t * x = (const block_q_t *) vx;
51325132
const block_q8_1 * y = (const block_q8_1 *) vy;
51335133

5134-
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
5135-
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
5134+
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
5135+
const int ibx = row*blocks_per_row + i; // x block index
51365136

5137-
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
5137+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
51385138

51395139
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
51405140

@@ -10918,6 +10918,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
1091810918
if (a->ne[3] != b->ne[3]) {
1091910919
return false;
1092010920
}
10921+
ggml_type a_type = a->type;
10922+
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
10923+
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
10924+
return false;
10925+
}
10926+
}
1092110927
return true;
1092210928
} break;
1092310929
case GGML_OP_GET_ROWS:

ggml-quants.c

+30-44
Original file line numberDiff line numberDiff line change
@@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
12741274
}
12751275
float sumlx = 0;
12761276
float suml2 = 0;
1277+
#ifdef HAVE_BUGGY_APPLE_LINKER
1278+
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
1279+
for (volatile int i = 0; i < n; ++i) {
1280+
#else
12771281
for (int i = 0; i < n; ++i) {
1282+
#endif
12781283
int l = nearest_int(iscale * x[i]);
12791284
l = MAX(-nmax, MIN(nmax-1, l));
12801285
L[i] = l + nmax;
@@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
16491654
float max = x[0];
16501655
float sum_w = weights ? weights[0] : x[0]*x[0];
16511656
float sum_x = sum_w * x[0];
1657+
#ifdef HAVE_BUGGY_APPLE_LINKER
1658+
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
1659+
for (volatile int i = 1; i < n; ++i) {
1660+
#else
16521661
for (int i = 1; i < n; ++i) {
1662+
#endif
16531663
if (x[i] < min) min = x[i];
16541664
if (x[i] > max) max = x[i];
16551665
float w = weights ? weights[i] : x[i]*x[i];
@@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
16601670
min = 0;
16611671
}
16621672
if (max <= min) {
1663-
for (int i = 0; i < n; ++i) L[i] = 0;
1673+
memset(L, 0, n);
16641674
*the_min = -min;
16651675
return 0.f;
16661676
}
@@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
18621872

18631873
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
18641874
(void)hist;
1865-
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1875+
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
18661876
if (!quant_weights) {
18671877
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
18681878
}
@@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
21812191

21822192
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
21832193
(void)hist;
2184-
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2194+
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
21852195
if (!quant_weights) {
21862196
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
21872197
}
@@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
24482458

24492459
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
24502460
(void)hist;
2451-
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2461+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
24522462
if (!quant_weights) {
24532463
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
24542464
}
@@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
27712781

27722782
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
27732783
(void)hist;
2774-
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2784+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
27752785
if (!quant_weights) {
27762786
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
27772787
}
@@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
30253035

30263036
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
30273037
(void)hist;
3028-
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3038+
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
30293039
if (!quant_weights) {
30303040
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
30313041
}
@@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
30723082
if (!quant_weights) {
30733083
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
30743084
}
3075-
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3085+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
30763086
char * qrow = (char *)dst;
30773087
for (int row = 0; row < nrow; ++row) {
30783088
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
@@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
31163126
if (!quant_weights) {
31173127
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
31183128
}
3119-
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3129+
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
31203130
char * qrow = (char *)dst;
31213131
for (int row = 0; row < nrow; ++row) {
31223132
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
@@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
31693179
if (!quant_weights) {
31703180
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
31713181
}
3172-
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3182+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
31733183
char * qrow = (char *)dst;
31743184
for (int row = 0; row < nrow; ++row) {
31753185
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
@@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
32213231
if (!quant_weights) {
32223232
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
32233233
}
3224-
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3234+
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
32253235
char * qrow = (char *)dst;
32263236
for (int row = 0; row < nrow; ++row) {
32273237
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
@@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
85658575
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
85668576
}
85678577

8568-
static void q2xs_init_impl(int grid_size) {
8578+
void iq2xs_init_impl(int grid_size) {
85698579
const int gindex = iq2_data_index(grid_size);
85708580
if (iq2_data[gindex].grid) {
85718581
return;
@@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
87208730
free(dist2);
87218731
}
87228732

8723-
void ggml_init_iq2_quantization(enum ggml_type type) {
8724-
if (type == GGML_TYPE_IQ2_XXS) {
8725-
q2xs_init_impl(256);
8726-
}
8727-
else if (type == GGML_TYPE_IQ2_XS) {
8728-
q2xs_init_impl(512);
8729-
}
8730-
else {
8731-
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8732-
}
8733-
}
8734-
8735-
static void q2xs_deinit_impl(int grid_size) {
8733+
void iq2xs_free_impl(int grid_size) {
87368734
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
87378735
const int gindex = iq2_data_index(grid_size);
87388736
if (iq2_data[gindex].grid) {
@@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
87428740
}
87438741
}
87448742

8745-
void ggml_deinit_iq2_quantization(enum ggml_type type) {
8746-
if (type == GGML_TYPE_IQ2_XXS) {
8747-
q2xs_deinit_impl(256);
8748-
}
8749-
else if (type == GGML_TYPE_IQ2_XS) {
8750-
q2xs_deinit_impl(512);
8751-
}
8752-
else {
8753-
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8754-
}
8755-
}
8756-
87578743
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
87588744
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
87598745
int num_neighbors = neighbours[0];
@@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
87868772
const int * kmap_q2xs = iq2_data[gindex].map;
87878773
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
87888774

8789-
GGML_ASSERT(quant_weights);
8790-
GGML_ASSERT(kgrid_q2xs);
8791-
GGML_ASSERT(kmap_q2xs);
8792-
GGML_ASSERT(kneighbors_q2xs);
8775+
GGML_ASSERT(quant_weights && "missing quantization weights");
8776+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
8777+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
8778+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
87938779
GGML_ASSERT(n%QK_K == 0);
87948780

87958781
const int kMaxQ = 3;
@@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
90058991
const int * kmap_q2xs = iq2_data[gindex].map;
90068992
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
90078993

9008-
GGML_ASSERT(quant_weights);
9009-
GGML_ASSERT(kmap_q2xs);
9010-
GGML_ASSERT(kgrid_q2xs);
9011-
GGML_ASSERT(kneighbors_q2xs);
8994+
GGML_ASSERT(quant_weights && "missing quantization weights");
8995+
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
8996+
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
8997+
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
90128998
GGML_ASSERT(n%QK_K == 0);
90138999

90149000
const int kMaxQ = 3;

ggml-quants.h

+3
Original file line numberDiff line numberDiff line change
@@ -257,3 +257,6 @@ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row,
257257
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
258258
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
259259
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
260+
261+
void iq2xs_init_impl(int grid_size);
262+
void iq2xs_free_impl(int grid_size);

ggml.c

+31-3
Original file line numberDiff line numberDiff line change
@@ -18524,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(
1852418524

1852518525
////////////////////////////////////////////////////////////////////////////////
1852618526

18527+
void ggml_quantize_init(enum ggml_type type) {
18528+
ggml_critical_section_start();
18529+
18530+
switch (type) {
18531+
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
18532+
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
18533+
default: // nothing
18534+
break;
18535+
}
18536+
18537+
ggml_critical_section_end();
18538+
}
18539+
18540+
void ggml_quantize_free(void) {
18541+
ggml_critical_section_start();
18542+
18543+
iq2xs_free_impl(256);
18544+
iq2xs_free_impl(512);
18545+
18546+
ggml_critical_section_end();
18547+
}
18548+
1852718549
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
1852818550
assert(k % QK4_0 == 0);
1852918551
const int nb = k / QK4_0;
@@ -18651,9 +18673,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
1865118673
return (n/QK8_0*sizeof(block_q8_0));
1865218674
}
1865318675

18676+
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
18677+
return
18678+
type == GGML_TYPE_IQ2_XXS ||
18679+
type == GGML_TYPE_IQ2_XS;
18680+
}
18681+
1865418682
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
1865518683
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
18656-
(void)imatrix;
18684+
ggml_quantize_init(type); // this is noop if already initialized
1865718685
size_t result = 0;
1865818686
int n = nrows * n_per_row;
1865918687
switch (type) {
@@ -18766,13 +18794,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
1876618794
} break;
1876718795
case GGML_TYPE_F16:
1876818796
{
18769-
int elemsize = sizeof(ggml_fp16_t);
18797+
size_t elemsize = sizeof(ggml_fp16_t);
1877018798
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
1877118799
result = n * elemsize;
1877218800
} break;
1877318801
case GGML_TYPE_F32:
1877418802
{
18775-
int elemsize = sizeof(float);
18803+
size_t elemsize = sizeof(float);
1877618804
result = n * elemsize;
1877718805
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
1877818806
} break;

ggml.h

+16-4
Original file line numberDiff line numberDiff line change
@@ -2065,6 +2065,18 @@ extern "C" {
20652065
// quantization
20662066
//
20672067

2068+
// - ggml_quantize_init can be called multiple times with the same type
2069+
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
2070+
// automatically called by ggml_quantize_chunk for convenience
2071+
//
2072+
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2073+
// call this at the end of the program to avoid memory leaks
2074+
//
2075+
// note: these are thread-safe
2076+
//
2077+
GGML_API void ggml_quantize_init(enum ggml_type type);
2078+
GGML_API void ggml_quantize_free(void);
2079+
20682080
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
20692081
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
20702082
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2078,13 +2090,13 @@ extern "C" {
20782090
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
20792091
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
20802092

2093+
// some quantization type cannot be used without an importance matrix
2094+
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2095+
2096+
// calls ggml_quantize_init internally (i.e. can allocate memory)
20812097
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
20822098
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
20832099

2084-
// These are needed for IQ2_XS and IQ2_XXS quantizations
2085-
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
2086-
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
2087-
20882100
//
20892101
// gguf
20902102
//

0 commit comments

Comments (0)