Testing 2 devices Backend 1/2: ROCm0 Device description: AMD Radeon AI PRO R9700 Device memory: 32624 MB (32556 MB free) MUL_MAT(type_a=f16,type_b=f32,m=16416,n=1,k=128,bs=[8,1],nr=[4,1],per=[0,2,1,3],v=0,o=1): 2976 runs - 355.35 us/run - 134.48 MFLOP/run - 378.44 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=128,n=1,k=16416,bs=[8,1],nr=[4,1],per=[0,1,2,3],v=1,o=1): 11160 runs - 95.55 us/run - 134.48 MFLOP/run -  1.41 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 3408 runs - 371.89 us/run - 117.44 MFLOP/run - 315.80 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5964 runs - 189.74 us/run - 117.44 MFLOP/run - 618.96 GFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5964 runs - 189.91 us/run - 117.44 MFLOP/run - 618.39 GFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 34932 runs - 28.65 us/run - 117.44 MFLOP/run -  4.10 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 34932 runs - 29.29 us/run - 117.44 MFLOP/run -  4.01 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 27264 runs - 37.02 us/run - 117.44 MFLOP/run -  3.17 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 29820 runs - 34.34 us/run - 117.44 MFLOP/run -  3.42 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21300 runs - 47.17 us/run - 117.44 MFLOP/run -  2.49 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 32376 runs - 31.46 us/run - 117.44 MFLOP/run -  3.73 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 24708 runs - 40.89 us/run - 117.44 MFLOP/run -  2.87 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15336 runs - 66.83 us/run - 117.44 MFLOP/run -  1.76 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23856 runs - 42.22 us/run - 117.44 MFLOP/run -  2.78 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22152 runs - 45.36 us/run - 117.44 MFLOP/run -  2.59 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 63.43 us/run - 117.44 MFLOP/run -  1.85 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15336 runs - 68.31 us/run - 117.44 MFLOP/run -  1.72 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 19596 runs - 52.68 us/run - 117.44 MFLOP/run -  2.23 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 71.52 us/run - 117.44 MFLOP/run -  1.64 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 54.21 us/run - 117.44 MFLOP/run -  2.17 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 35784 runs - 28.45 us/run - 117.44 MFLOP/run -  4.13 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 30672 runs - 32.61 us/run - 117.44 MFLOP/run -  3.60 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 33228 runs - 30.47 us/run - 117.44 MFLOP/run -  3.85 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 13632 runs - 73.88 us/run - 117.44 MFLOP/run -  1.59 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 32376 runs - 31.59 us/run - 117.44 MFLOP/run -  3.72 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2982 runs - 372.97 us/run - 234.88 MFLOP/run - 629.76 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5538 runs - 192.50 us/run - 234.88 MFLOP/run -  1.22 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5538 runs - 193.14 us/run - 234.88 MFLOP/run -  1.22 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 24282 runs - 41.69 us/run - 234.88 MFLOP/run -  5.63 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 29820 runs - 33.71 us/run - 234.88 MFLOP/run -  6.97 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22578 runs - 45.13 us/run - 234.88 MFLOP/run -  5.20 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23856 runs - 41.96 us/run - 234.88 MFLOP/run -  5.60 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20448 runs - 49.71 us/run - 234.88 MFLOP/run -  4.73 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 24708 runs - 40.49 us/run - 234.88 MFLOP/run -  5.80 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 62.84 us/run - 234.88 MFLOP/run -  3.74 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12354 runs - 81.96 us/run - 234.88 MFLOP/run -  2.87 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 62.01 us/run - 234.88 MFLOP/run -  3.79 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15336 runs - 65.42 us/run - 234.88 MFLOP/run -  3.59 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 79.82 us/run - 234.88 MFLOP/run -  2.94 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 13206 runs - 77.50 us/run - 234.88 MFLOP/run -  3.03 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16614 runs - 60.83 us/run - 234.88 MFLOP/run -  3.86 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 80.10 us/run - 234.88 MFLOP/run -  2.93 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16614 runs - 61.60 us/run - 234.88 MFLOP/run -  3.81 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 29820 runs - 33.84 us/run - 234.88 MFLOP/run -  6.94 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 25986 runs - 39.04 us/run - 234.88 MFLOP/run -  6.02 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23004 runs - 43.72 us/run - 234.88 MFLOP/run -  5.37 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 79.74 us/run - 234.88 MFLOP/run -  2.95 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 28542 runs - 35.38 us/run - 234.88 MFLOP/run -  6.64 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2840 runs - 374.88 us/run - 352.32 MFLOP/run - 939.83 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5112 runs - 197.40 us/run - 352.32 MFLOP/run -  1.78 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5112 runs - 206.69 us/run - 352.32 MFLOP/run -  1.70 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21584 runs - 46.64 us/run - 352.32 MFLOP/run -  7.55 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22720 runs - 44.57 us/run - 352.32 MFLOP/run -  7.91 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 19028 runs - 52.58 us/run - 352.32 MFLOP/run -  6.70 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20732 runs - 48.71 us/run - 352.32 MFLOP/run -  7.23 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 53.65 us/run - 352.32 MFLOP/run -  6.57 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21300 runs - 47.28 us/run - 352.32 MFLOP/run -  7.45 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11644 runs - 87.10 us/run - 352.32 MFLOP/run -  4.04 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9940 runs - 102.53 us/run - 352.32 MFLOP/run -  3.44 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11928 runs - 85.15 us/run - 352.32 MFLOP/run -  4.14 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11360 runs - 89.04 us/run - 352.32 MFLOP/run -  3.96 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10508 runs - 95.38 us/run - 352.32 MFLOP/run -  3.69 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12212 runs - 83.76 us/run - 352.32 MFLOP/run -  4.21 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 69.75 us/run - 352.32 MFLOP/run -  5.05 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11360 runs - 88.39 us/run - 352.32 MFLOP/run -  3.99 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 69.18 us/run - 352.32 MFLOP/run -  5.09 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21016 runs - 47.91 us/run - 352.32 MFLOP/run -  7.35 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20164 runs - 49.99 us/run - 352.32 MFLOP/run -  7.05 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21016 runs - 47.72 us/run - 352.32 MFLOP/run -  7.38 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11928 runs - 85.75 us/run - 352.32 MFLOP/run -  4.11 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 25560 runs - 39.37 us/run - 352.32 MFLOP/run -  8.95 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2769 runs - 376.21 us/run - 469.76 MFLOP/run -  1.25 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4899 runs - 205.05 us/run - 469.76 MFLOP/run -  2.29 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4260 runs - 236.39 us/run - 469.76 MFLOP/run -  1.99 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18957 runs - 53.17 us/run - 469.76 MFLOP/run -  8.83 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 19596 runs - 51.16 us/run - 469.76 MFLOP/run -  9.18 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17040 runs - 59.19 us/run - 469.76 MFLOP/run -  7.94 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17679 runs - 56.78 us/run - 469.76 MFLOP/run -  8.27 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15975 runs - 62.89 us/run - 469.76 MFLOP/run -  7.47 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 53.77 us/run - 469.76 MFLOP/run -  8.74 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9159 runs - 111.30 us/run - 469.76 MFLOP/run -  4.22 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8307 runs - 121.29 us/run - 469.76 MFLOP/run -  3.87 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9372 runs - 108.68 us/run - 469.76 MFLOP/run -  4.32 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8946 runs - 112.83 us/run - 469.76 MFLOP/run -  4.16 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8946 runs - 112.11 us/run - 469.76 MFLOP/run -  4.19 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10863 runs - 92.15 us/run - 469.76 MFLOP/run -  5.10 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12567 runs - 79.67 us/run - 469.76 MFLOP/run -  5.90 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10437 runs - 97.04 us/run - 469.76 MFLOP/run -  4.84 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12567 runs - 80.41 us/run - 469.76 MFLOP/run -  5.84 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17679 runs - 56.73 us/run - 469.76 MFLOP/run -  8.28 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16827 runs - 59.59 us/run - 469.76 MFLOP/run -  7.88 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18531 runs - 54.21 us/run - 469.76 MFLOP/run -  8.66 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10863 runs - 92.73 us/run - 469.76 MFLOP/run -  5.07 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20022 runs - 50.08 us/run - 469.76 MFLOP/run -  9.38 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2736 runs - 380.33 us/run - 587.20 MFLOP/run -  1.54 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4617 runs - 217.14 us/run - 587.20 MFLOP/run -  2.70 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4275 runs - 239.30 us/run - 587.20 MFLOP/run -  2.45 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14877 runs - 67.49 us/run - 587.20 MFLOP/run -  8.70 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15219 runs - 65.82 us/run - 587.20 MFLOP/run -  8.92 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14364 runs - 70.32 us/run - 587.20 MFLOP/run -  8.35 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15390 runs - 65.65 us/run - 587.20 MFLOP/run -  8.94 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 13680 runs - 73.79 us/run - 587.20 MFLOP/run -  7.96 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14706 runs - 68.12 us/run - 587.20 MFLOP/run -  8.62 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7524 runs - 135.24 us/run - 587.20 MFLOP/run -  4.34 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7182 runs - 142.01 us/run - 587.20 MFLOP/run -  4.13 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7695 runs - 132.06 us/run - 587.20 MFLOP/run -  4.45 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7524 runs - 135.07 us/run - 587.20 MFLOP/run -  4.35 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7866 runs - 129.34 us/run - 587.20 MFLOP/run -  4.54 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10089 runs - 100.70 us/run - 587.20 MFLOP/run -  5.83 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11457 runs - 88.01 us/run - 587.20 MFLOP/run -  6.67 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8721 runs - 115.27 us/run - 587.20 MFLOP/run -  5.09 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11799 runs - 85.04 us/run - 587.20 MFLOP/run -  6.91 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15219 runs - 65.86 us/run - 587.20 MFLOP/run -  8.92 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14193 runs - 71.06 us/run - 587.20 MFLOP/run -  8.26 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14706 runs - 68.06 us/run - 587.20 MFLOP/run -  8.63 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10089 runs - 99.42 us/run - 587.20 MFLOP/run -  5.91 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17613 runs - 56.79 us/run - 587.20 MFLOP/run -  10.34 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2461 runs - 407.62 us/run - 939.52 MFLOP/run -  2.30 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4387 runs - 231.71 us/run - 939.52 MFLOP/run -  4.05 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4173 runs - 242.03 us/run - 939.52 MFLOP/run -  3.88 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10379 runs - 96.82 us/run - 939.52 MFLOP/run -  9.70 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10807 runs - 93.05 us/run - 939.52 MFLOP/run -  10.10 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9416 runs - 106.44 us/run - 939.52 MFLOP/run -  8.83 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10593 runs - 95.29 us/run - 939.52 MFLOP/run -  9.86 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8667 runs - 115.51 us/run - 939.52 MFLOP/run -  8.13 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9951 runs - 101.12 us/run - 939.52 MFLOP/run -  9.29 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4708 runs - 213.75 us/run - 939.52 MFLOP/run -  4.40 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4815 runs - 210.83 us/run - 939.52 MFLOP/run -  4.46 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5029 runs - 200.56 us/run - 939.52 MFLOP/run -  4.68 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4922 runs - 204.99 us/run - 939.52 MFLOP/run -  4.58 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5564 runs - 179.73 us/run - 939.52 MFLOP/run -  5.23 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7490 runs - 134.89 us/run - 939.52 MFLOP/run -  6.96 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8453 runs - 119.14 us/run - 939.52 MFLOP/run -  7.89 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7062 runs - 143.61 us/run - 939.52 MFLOP/run -  6.54 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8346 runs - 120.70 us/run - 939.52 MFLOP/run -  7.78 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9737 runs - 103.56 us/run - 939.52 MFLOP/run -  9.07 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10165 runs - 99.18 us/run - 939.52 MFLOP/run -  9.47 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9951 runs - 100.52 us/run - 939.52 MFLOP/run -  9.35 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7597 runs - 132.26 us/run - 939.52 MFLOP/run -  7.10 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11128 runs - 90.69 us/run - 939.52 MFLOP/run -  10.36 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 58 runs - 17246.88 us/run - 60.13 GFLOP/run -  3.49 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1598 runs - 626.25 us/run - 60.13 GFLOP/run -  96.02 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1528 runs - 654.70 us/run - 60.13 GFLOP/run -  91.84 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1408 runs - 710.32 us/run - 60.13 GFLOP/run -  84.65 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1116 runs - 897.03 us/run - 60.13 GFLOP/run -  67.03 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1188 runs - 843.15 us/run - 60.13 GFLOP/run -  71.32 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 992 runs - 1009.18 us/run - 60.13 GFLOP/run -  59.58 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1290 runs - 776.01 us/run - 60.13 GFLOP/run -  77.49 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1358 runs - 736.54 us/run - 60.13 GFLOP/run -  81.64 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 372 runs - 2691.72 us/run - 60.13 GFLOP/run -  22.34 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1004 runs - 996.13 us/run - 60.13 GFLOP/run -  60.36 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1152 runs - 868.83 us/run - 60.13 GFLOP/run -  69.21 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1046 runs - 957.01 us/run - 60.13 GFLOP/run -  62.83 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 532 runs - 1880.83 us/run - 60.13 GFLOP/run -  31.97 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1132 runs - 883.71 us/run - 60.13 GFLOP/run -  68.04 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 776 runs - 1290.30 us/run - 60.13 GFLOP/run -  46.60 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 774 runs - 1292.95 us/run - 60.13 GFLOP/run -  46.51 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1214 runs - 824.96 us/run - 60.13 GFLOP/run -  72.89 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1124 runs - 891.17 us/run - 60.13 GFLOP/run -  67.47 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1212 runs - 826.12 us/run - 60.13 GFLOP/run -  72.79 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1334 runs - 749.66 us/run - 60.13 GFLOP/run -  80.21 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1106 runs - 905.66 us/run - 60.13 GFLOP/run -  66.39 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1320 runs - 758.11 us/run - 60.13 GFLOP/run -  79.32 TFLOPS Backend ROCm0: OK Backend 2/2: CPU Skipping CPU backend 2/2 backends passed OK