@@ -986,42 +986,82 @@ kernel void kernel_mul_mat_q4_k_f32(
986
986
987
987
const int tid = tpitg.y ; // 0...16
988
988
const int il = tid/4 ; // 0...3
989
- const int ir = tid%4 ; // 0...3
989
+ // const int ir = tid%4; // 0...3
990
+ const int ir = tid - 4 *il;// 0...3
990
991
const int n = 4 ;
991
992
992
993
const int im = il/2 ; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
993
994
const int in = il%2 ;
995
+
994
996
const int l0 = n*(2 *ir + in);
997
+ const int q_offset = 32 *im + l0;
998
+ const int y_offset = 64 *im + l0;
995
999
996
1000
sum[ith] = 0 .0f ;
997
1001
1002
+ // uint16_t aux_scales[4];
1003
+ // thread uint8_t * sc = (thread uint8_t *)aux_scales;
1004
+
1005
+ // uint32_t aux32[4];
1006
+ // thread const uint8_t * sc = (thread const uint8_t *)aux32;
1007
+
1008
+ uchar2 sc1, sc2, sc3, sc4;
1009
+
998
1010
float sumf = 0 ;
999
1011
for (int i = tpitg.x ; i < nb; i += tptg.x ) {
1000
1012
1001
- device const uint8_t * q1 = (x + i)->qs + 32 *im + l0;
1002
- device const float * y1 = yy + i*QK_K + 64 *im + l0;
1013
+ device const uint8_t * q1 = (x + i)->qs + q_offset;
1003
1014
device const uint8_t * q2 = q1 + 64 ;
1015
+ device const float * y1 = yy + i*QK_K + y_offset;
1004
1016
device const float * y2 = y1 + 128 ;
1005
1017
1006
- device const uint16_t * a = (device const uint16_t *)(x + i)->scales ;
1007
-
1008
1018
const float dall = (float )((x + i)->d );
1009
1019
const float dmin = (float )((x + i)->dmin );
1010
1020
1011
- const uchar2 sc1 = as_type<uchar2>((uint16_t )(a[im+0 ] & kmask1));
1012
- const uchar2 sc2 = as_type<uchar2>((uint16_t )(a[im+2 ] & kmask1));
1013
- const uchar2 sc3 = as_type<uchar2>((uint16_t )(((a[im+4 ] >> 0 ) & kmask2) | ((a[im+0 ] & kmask3) >> 2 )));
1014
- const uchar2 sc4 = as_type<uchar2>((uint16_t )(((a[im+4 ] >> 4 ) & kmask2) | ((a[im+2 ] & kmask3) >> 2 )));
1021
+ // device const uint32_t * a = (device const uint32_t *)(x + i)->scales;
1022
+ // aux32[0] = a[0] & 0x3f3f3f3f; // scales for 0, 32, 64, 96
1023
+ // aux32[1] = a[1] & 0x3f3f3f3f; // mins for 0, 32, 64, 96
1024
+ // aux32[2] = ((a[2] >> 0) & 0x0f0f0f0f) | ((a[0] & 0xc0c0c0c0) >> 2); // scales for 128, 160, 192, 224
1025
+ // aux32[3] = ((a[2] >> 4) & 0x0f0f0f0f) | ((a[1] & 0xc0c0c0c0) >> 2); // mins for 128, 160, 192, 224
1026
+
1027
+ // aux_scales[0] = (uint16_t)(a[im+0] & kmask1);
1028
+ // aux_scales[1] = (uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2));
1029
+ // aux_scales[2] = (uint16_t)(a[im+2] & kmask1);
1030
+ // aux_scales[3] = (uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2));
1031
+
1032
+ device const uint16_t * a = (device const uint16_t *)(x + i)->scales ;
1033
+ sc1 = as_type<uchar2>((uint16_t )(a[im+0 ] & kmask1));
1034
+ sc2 = as_type<uchar2>((uint16_t )(a[im+2 ] & kmask1));
1035
+ sc3 = as_type<uchar2>((uint16_t )(((a[im+4 ] >> 0 ) & kmask2) | ((a[im+0 ] & kmask3) >> 2 )));
1036
+ sc4 = as_type<uchar2>((uint16_t )(((a[im+4 ] >> 4 ) & kmask2) | ((a[im+2 ] & kmask3) >> 2 )));
1015
1037
1016
- float2 s = {0 .f , 0 .f };
1038
+ // float2 s = {0.f, 0.f};
1039
+ float4 s = {0 .f , 0 .f , 0 .f , 0 .f };
1040
+ float smin = 0 ;
1017
1041
for (int l = 0 ; l < n; ++l) {
1018
- s[0 ] += y1 [l] * sc1[0 ] * (q1[l] & 0xF ) + y1 [l+32 ] * sc1[1 ] * (q1[l] >> 4 )
1019
- + y2[l] * sc3[0 ] * (q2[l] & 0xF ) + y2[l+32 ] * sc3[1 ] * (q2[l] >> 4 );
1020
- s[1 ] += y1 [l] * sc2[0 ] + y1 [l+32 ] * sc2[1 ] + y2[l] * sc4[0 ] + y2[l+32 ] * sc4[1 ];
1042
+
1043
+ // //s[0] += y1[l] * sc[0] * (q1[l] & 0xF) + y1[l+32] * sc[1] * (q1[l] >> 4)
1044
+ // // + y2[l] * sc[2] * (q2[l] & 0xF) + y2[l+32] * sc[3] * (q2[l] >> 4);
1045
+ // //s[1] += y1[l] * sc[4] + y1[l+32] * sc[5] + y2[l] * sc[6] + y2[l+32] * sc[7];
1046
+
1047
+ // //s[0] += y1[l] * sc[2*im+0] * (q1[l] & 0xF) + y1[l+32] * sc[2*im+1] * (q1[l] >> 4)
1048
+ // // + y2[l] * sc[2*im+8] * (q2[l] & 0xF) + y2[l+32] * sc[2*im+9] * (q2[l] >> 4);
1049
+ // //s[1] += y1[l] * sc[2*im+4] + y1[l+32] * sc[2*im+5] + y2[l] * sc[2*im+12] + y2[l+32] * sc[2*im+13];
1050
+
1051
+ // s[0] += y1[l] * sc1[0] * (q1[l] & 0xF) + y1[l+32] * sc1[1] * (q1[l] >> 4)
1052
+ // + y2[l] * sc3[0] * (q2[l] & 0xF) + y2[l+32] * sc3[1] * (q2[l] >> 4);
1053
+ // s[1] += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
1054
+
1055
+ s[0 ] += y1 [l] * (q1[l] & 0xF ); s[1 ] += y1 [l+32 ] * (q1[l] >> 4 );
1056
+ s[2 ] += y2[l] * (q2[l] & 0xF ); s[3 ] += y2[l+32 ] * (q2[l] >> 4 );
1057
+ smin += y1 [l] * sc2[0 ] + y1 [l+32 ] * sc2[1 ] + y2[l] * sc4[0 ] + y2[l+32 ] * sc4[1 ];
1058
+
1021
1059
}
1022
- sumf += dall * s[0 ] - dmin * s[1 ];
1060
+ // sumf += dall * s[0] - dmin * s[1];
1061
+ sumf += dall * (s[0 ] * sc1[0 ] + s[1 ] * sc1[1 ] + s[2 ] * sc3[0 ] + s[3 ] * sc3[1 ]) - dmin * smin;
1023
1062
1024
1063
}
1064
+
1025
1065
sum[ith] = sumf;
1026
1066
1027
1067
//
0 commit comments