Testing 2 devices Backend 1/2: ROCm0 Device description: AMD Radeon AI PRO R9700 Device memory: 32624 MB (32556 MB free) MUL_MAT(type_a=f16,type_b=f32,m=16416,n=1,k=128,bs=[8,1],nr=[4,1],per=[0,2,1,3],v=0,o=1): 2976 runs - 359.53 us/run - 134.48 MFLOP/run - 374.04 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=128,n=1,k=16416,bs=[8,1],nr=[4,1],per=[0,1,2,3],v=1,o=1): 10416 runs - 96.07 us/run - 134.48 MFLOP/run -  1.40 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 3408 runs - 371.91 us/run - 117.44 MFLOP/run - 315.77 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5964 runs - 189.75 us/run - 117.44 MFLOP/run - 618.91 GFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5964 runs - 189.92 us/run - 117.44 MFLOP/run - 618.36 GFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 34932 runs - 28.73 us/run - 117.44 MFLOP/run -  4.09 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 34932 runs - 29.30 us/run - 117.44 MFLOP/run -  4.01 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 27264 runs - 37.56 us/run - 117.44 MFLOP/run -  3.13 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 28968 runs - 34.65 us/run - 117.44 MFLOP/run -  3.39 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21300 runs - 46.97 us/run - 117.44 MFLOP/run -  2.50 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 32376 runs - 31.61 us/run - 117.44 MFLOP/run -  3.72 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 24708 runs - 41.31 us/run - 117.44 MFLOP/run -  2.84 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15336 runs - 67.61 us/run - 117.44 MFLOP/run -  1.74 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23856 runs - 42.66 us/run - 117.44 MFLOP/run -  2.75 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22152 runs - 45.70 us/run - 117.44 MFLOP/run -  2.57 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 63.83 us/run - 117.44 MFLOP/run -  1.84 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 69.23 us/run - 117.44 MFLOP/run -  1.70 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 53.41 us/run - 117.44 MFLOP/run -  2.20 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 71.71 us/run - 117.44 MFLOP/run -  1.64 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 54.74 us/run - 117.44 MFLOP/run -  2.15 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 35784 runs - 28.49 us/run - 117.44 MFLOP/run -  4.12 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 31524 runs - 32.26 us/run - 117.44 MFLOP/run -  3.64 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 33228 runs - 30.69 us/run - 117.44 MFLOP/run -  3.83 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 13632 runs - 74.57 us/run - 117.44 MFLOP/run -  1.57 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=1,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 32376 runs - 31.62 us/run - 117.44 MFLOP/run -  3.71 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2982 runs - 373.08 us/run - 234.88 MFLOP/run - 629.57 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5538 runs - 192.83 us/run - 234.88 MFLOP/run -  1.22 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5538 runs - 193.49 us/run - 234.88 MFLOP/run -  1.21 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 24282 runs - 41.91 us/run - 234.88 MFLOP/run -  5.60 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 29394 runs - 34.08 us/run - 234.88 MFLOP/run -  6.89 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22152 runs - 45.38 us/run - 234.88 MFLOP/run -  5.18 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23856 runs - 42.02 us/run - 234.88 MFLOP/run -  5.59 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20022 runs - 50.03 us/run - 234.88 MFLOP/run -  4.69 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 25134 runs - 40.38 us/run - 234.88 MFLOP/run -  5.82 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15762 runs - 63.74 us/run - 234.88 MFLOP/run -  3.69 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12354 runs - 82.99 us/run - 234.88 MFLOP/run -  2.83 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 62.99 us/run - 234.88 MFLOP/run -  3.73 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15336 runs - 66.14 us/run - 234.88 MFLOP/run -  3.55 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 80.59 us/run - 234.88 MFLOP/run -  2.91 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 78.37 us/run - 234.88 MFLOP/run -  3.00 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16614 runs - 61.46 us/run - 234.88 MFLOP/run -  3.82 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 80.78 us/run - 234.88 MFLOP/run -  2.91 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16188 runs - 62.16 us/run - 234.88 MFLOP/run -  3.78 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 29394 runs - 34.15 us/run - 234.88 MFLOP/run -  6.88 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 25560 runs - 39.27 us/run - 234.88 MFLOP/run -  5.98 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 23004 runs - 44.03 us/run - 234.88 MFLOP/run -  5.33 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12780 runs - 80.17 us/run - 234.88 MFLOP/run -  2.93 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=2,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 28968 runs - 34.91 us/run - 234.88 MFLOP/run -  6.73 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2840 runs - 374.82 us/run - 352.32 MFLOP/run - 939.99 GFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5112 runs - 197.80 us/run - 352.32 MFLOP/run -  1.78 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4828 runs - 207.77 us/run - 352.32 MFLOP/run -  1.70 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21584 runs - 46.74 us/run - 352.32 MFLOP/run -  7.54 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 22436 runs - 44.80 us/run - 352.32 MFLOP/run -  7.86 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 19028 runs - 52.79 us/run - 352.32 MFLOP/run -  6.67 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20448 runs - 49.01 us/run - 352.32 MFLOP/run -  7.19 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 53.90 us/run - 352.32 MFLOP/run -  6.54 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21300 runs - 47.38 us/run - 352.32 MFLOP/run -  7.44 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11644 runs - 87.69 us/run - 352.32 MFLOP/run -  4.02 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9940 runs - 103.08 us/run - 352.32 MFLOP/run -  3.42 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11928 runs - 85.74 us/run - 352.32 MFLOP/run -  4.11 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11360 runs - 89.58 us/run - 352.32 MFLOP/run -  3.93 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10508 runs - 95.91 us/run - 352.32 MFLOP/run -  3.67 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11928 runs - 84.18 us/run - 352.32 MFLOP/run -  4.19 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 70.04 us/run - 352.32 MFLOP/run -  5.03 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11360 runs - 88.71 us/run - 352.32 MFLOP/run -  3.97 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14484 runs - 69.40 us/run - 352.32 MFLOP/run -  5.08 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21016 runs - 48.08 us/run - 352.32 MFLOP/run -  7.33 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20164 runs - 50.08 us/run - 352.32 MFLOP/run -  7.04 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 21016 runs - 47.76 us/run - 352.32 MFLOP/run -  7.38 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11644 runs - 85.98 us/run - 352.32 MFLOP/run -  4.10 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=3,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 25560 runs - 39.52 us/run - 352.32 MFLOP/run -  8.91 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2769 runs - 376.23 us/run - 469.76 MFLOP/run -  1.25 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4899 runs - 205.17 us/run - 469.76 MFLOP/run -  2.29 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4260 runs - 237.59 us/run - 469.76 MFLOP/run -  1.98 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18957 runs - 53.32 us/run - 469.76 MFLOP/run -  8.81 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 19596 runs - 51.26 us/run - 469.76 MFLOP/run -  9.16 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17040 runs - 59.32 us/run - 469.76 MFLOP/run -  7.92 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17679 runs - 56.95 us/run - 469.76 MFLOP/run -  8.25 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15975 runs - 63.08 us/run - 469.76 MFLOP/run -  7.45 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18744 runs - 53.48 us/run - 469.76 MFLOP/run -  8.78 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9159 runs - 111.52 us/run - 469.76 MFLOP/run -  4.21 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8307 runs - 121.37 us/run - 469.76 MFLOP/run -  3.87 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9372 runs - 108.88 us/run - 469.76 MFLOP/run -  4.31 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8946 runs - 113.09 us/run - 469.76 MFLOP/run -  4.15 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8946 runs - 112.36 us/run - 469.76 MFLOP/run -  4.18 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10863 runs - 92.22 us/run - 469.76 MFLOP/run -  5.09 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12567 runs - 79.76 us/run - 469.76 MFLOP/run -  5.89 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10437 runs - 97.17 us/run - 469.76 MFLOP/run -  4.83 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 12567 runs - 80.55 us/run - 469.76 MFLOP/run -  5.83 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17679 runs - 56.76 us/run - 469.76 MFLOP/run -  8.28 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 16827 runs - 59.63 us/run - 469.76 MFLOP/run -  7.88 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 18318 runs - 54.76 us/run - 469.76 MFLOP/run -  8.58 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10863 runs - 92.82 us/run - 469.76 MFLOP/run -  5.06 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=4,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 20235 runs - 49.80 us/run - 469.76 MFLOP/run -  9.43 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2736 runs - 380.45 us/run - 587.20 MFLOP/run -  1.54 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4617 runs - 217.70 us/run - 587.20 MFLOP/run -  2.70 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4275 runs - 238.23 us/run - 587.20 MFLOP/run -  2.46 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14877 runs - 67.77 us/run - 587.20 MFLOP/run -  8.66 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15219 runs - 65.93 us/run - 587.20 MFLOP/run -  8.91 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14193 runs - 70.51 us/run - 587.20 MFLOP/run -  8.33 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15219 runs - 65.78 us/run - 587.20 MFLOP/run -  8.93 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 13680 runs - 73.95 us/run - 587.20 MFLOP/run -  7.94 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14706 runs - 68.14 us/run - 587.20 MFLOP/run -  8.62 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7524 runs - 135.53 us/run - 587.20 MFLOP/run -  4.33 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7182 runs - 142.40 us/run - 587.20 MFLOP/run -  4.12 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7695 runs - 132.46 us/run - 587.20 MFLOP/run -  4.43 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7524 runs - 135.52 us/run - 587.20 MFLOP/run -  4.33 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7866 runs - 129.53 us/run - 587.20 MFLOP/run -  4.53 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10089 runs - 100.77 us/run - 587.20 MFLOP/run -  5.83 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11457 runs - 88.15 us/run - 587.20 MFLOP/run -  6.66 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8721 runs - 115.43 us/run - 587.20 MFLOP/run -  5.09 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11799 runs - 85.17 us/run - 587.20 MFLOP/run -  6.89 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 15219 runs - 65.96 us/run - 587.20 MFLOP/run -  8.90 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14193 runs - 71.12 us/run - 587.20 MFLOP/run -  8.26 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 14706 runs - 68.57 us/run - 587.20 MFLOP/run -  8.56 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10089 runs - 99.47 us/run - 587.20 MFLOP/run -  5.90 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=5,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 17784 runs - 56.70 us/run - 587.20 MFLOP/run -  10.36 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 2461 runs - 407.78 us/run - 939.52 MFLOP/run -  2.30 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4280 runs - 234.18 us/run - 939.52 MFLOP/run -  4.01 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4173 runs - 243.56 us/run - 939.52 MFLOP/run -  3.86 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10379 runs - 97.03 us/run - 939.52 MFLOP/run -  9.68 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10807 runs - 93.16 us/run - 939.52 MFLOP/run -  10.08 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9416 runs - 106.68 us/run - 939.52 MFLOP/run -  8.81 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10593 runs - 94.53 us/run - 939.52 MFLOP/run -  9.94 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8667 runs - 116.25 us/run - 939.52 MFLOP/run -  8.08 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9951 runs - 101.10 us/run - 939.52 MFLOP/run -  9.29 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4708 runs - 214.39 us/run - 939.52 MFLOP/run -  4.38 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4815 runs - 210.85 us/run - 939.52 MFLOP/run -  4.46 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5029 runs - 200.61 us/run - 939.52 MFLOP/run -  4.68 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 4922 runs - 205.16 us/run - 939.52 MFLOP/run -  4.58 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 5564 runs - 180.25 us/run - 939.52 MFLOP/run -  5.21 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7490 runs - 134.91 us/run - 939.52 MFLOP/run -  6.96 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8453 runs - 118.77 us/run - 939.52 MFLOP/run -  7.91 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7062 runs - 143.65 us/run - 939.52 MFLOP/run -  6.54 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 8346 runs - 120.84 us/run - 939.52 MFLOP/run -  7.78 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9737 runs - 103.60 us/run - 939.52 MFLOP/run -  9.07 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 10165 runs - 99.22 us/run - 939.52 MFLOP/run -  9.47 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 9951 runs - 100.65 us/run - 939.52 MFLOP/run -  9.33 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 7597 runs - 132.40 us/run - 939.52 MFLOP/run -  7.10 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=8,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 11128 runs - 90.61 us/run - 939.52 MFLOP/run -  10.37 TFLOPS MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 60 runs - 17219.08 us/run - 60.13 GFLOP/run -  3.49 TFLOPS MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1594 runs - 628.04 us/run - 60.13 GFLOP/run -  95.74 TFLOPS MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1548 runs - 646.41 us/run - 60.13 GFLOP/run -  93.02 TFLOPS MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1182 runs - 846.84 us/run - 60.13 GFLOP/run -  71.00 TFLOPS MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1170 runs - 855.96 us/run - 60.13 GFLOP/run -  70.25 TFLOPS MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1040 runs - 962.59 us/run - 60.13 GFLOP/run -  62.47 TFLOPS MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1046 runs - 957.85 us/run - 60.13 GFLOP/run -  62.78 TFLOPS MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1112 runs - 899.88 us/run - 60.13 GFLOP/run -  66.82 TFLOPS MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1206 runs - 830.50 us/run - 60.13 GFLOP/run -  72.40 TFLOPS MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1168 runs - 856.52 us/run - 60.13 GFLOP/run -  70.20 TFLOPS MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1132 runs - 884.09 us/run - 60.13 GFLOP/run -  68.01 TFLOPS MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1172 runs - 853.82 us/run - 60.13 GFLOP/run -  70.42 TFLOPS MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1140 runs - 877.24 us/run - 60.13 GFLOP/run -  68.54 TFLOPS MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1146 runs - 873.46 us/run - 60.13 GFLOP/run -  68.84 TFLOPS MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1202 runs - 832.79 us/run - 60.13 GFLOP/run -  72.20 TFLOPS MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1198 runs - 834.82 us/run - 60.13 GFLOP/run -  72.03 TFLOPS MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1198 runs - 835.92 us/run - 60.13 GFLOP/run -  71.93 TFLOPS MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1188 runs - 842.03 us/run - 60.13 GFLOP/run -  71.41 TFLOPS MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1210 runs - 827.19 us/run - 60.13 GFLOP/run -  72.69 TFLOPS MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1204 runs - 831.57 us/run - 60.13 GFLOP/run -  72.31 TFLOPS MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1178 runs - 849.90 us/run - 60.13 GFLOP/run -  70.75 TFLOPS MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1186 runs - 844.29 us/run - 60.13 GFLOP/run -  71.22 TFLOPS MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): 1182 runs - 847.27 us/run - 60.13 GFLOP/run -  70.97 TFLOPS Backend ROCm0: OK Backend 2/2: CPU Skipping CPU backend 2/2 backends passed OK