@@ -239,10 +239,7 @@ def write_tensors(self):
             data: np.ndarray = data  # type hint
             n_dims = len(data.shape)
             data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
+            data_qtype: gguf.GGMLQuantizationType | None = None

             # when both are True, f32 should win
             extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
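
A note on the selection logic this hunk sets up: `data_qtype` starts out as None and is only assigned by the branch in the next hunk, so the new assert there can catch any tensor that falls through without a decided type. The precedence can be restated as a small pure function (a sketch for illustration only; `choose_qtype` is not a name from this patch):

    # Sketch: the tensor-type precedence used in write_tensors().
    # extra_f32 always wins over extra_f16; otherwise eligible tensors
    # follow the requested --outtype, and everything else stays float32.
    import gguf

    def choose_qtype(ftype: gguf.GGMLQuantizationType,
                     extra_f32: bool, extra_f16: bool) -> gguf.GGMLQuantizationType:
        if ftype != gguf.GGMLQuantizationType.F32 and extra_f16 and not extra_f32:
            return ftype  # F16 or BF16, whichever --outtype requested
        return gguf.GGMLQuantizationType.F32
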
@@ -254,20 +251,33 @@ def write_tensors(self):
             # if f16 desired, convert any float32 2-dim weight tensors to float16
             extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)

-            # when both extra_f32 and extra_f16 are False, convert to float32 by default
-            if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16):
-                data = data.astype(np.float32)
+            if self.ftype != gguf.GGMLQuantizationType.F32 and extra_f16 and not extra_f32:
+                if self.ftype == gguf.GGMLQuantizationType.F16:
+                    if data_dtype != np.float16:
+                        data = data.astype(np.float16)
+                    data_qtype = gguf.GGMLQuantizationType.F16
+
+                elif self.ftype == gguf.GGMLQuantizationType.BF16:
+                    if data_dtype != np.float32:
+                        data = data.astype(np.float32)
+                    data.dtype = np.int32
+                    data = (data >> 16).astype(np.int16)
+                    data_qtype = gguf.GGMLQuantizationType.BF16
+
+            else:  # by default, convert to float32
+                if data_dtype != np.float32:
+                    data = data.astype(np.float32)
+                data_qtype = gguf.GGMLQuantizationType.F32

-            if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32:
-                data = data.astype(np.float16)
+            assert data_qtype is not None

             # reverse shape to make it similar to the internal ggml dimension order
             shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"

             # n_dims is implicit in the shape
-            logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}")
+            logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

-            self.gguf_writer.add_tensor(new_name, data)
+            self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

     def write(self):
         self.write_tensors()
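
The BF16 branch above is the noteworthy part: NumPy has no native bfloat16 dtype, so the patch reinterprets the float32 buffer as int32 in place (`data.dtype = np.int32`) and keeps only the high 16 bits of each element, which are exactly the bfloat16 encoding (sign bit, 8-bit exponent, top 7 mantissa bits). Dropping the low bits truncates toward zero rather than rounding to nearest. A minimal standalone sketch of the same trick, assuming a little-endian host:

    import numpy as np

    data = np.array([1.0, -2.5, 3.14159], dtype=np.float32)

    # Reinterpret the same bytes as int32 (view() is equivalent to the
    # in-place dtype assignment in the patch), then drop the low 16 bits
    # of each float32; what remains is the bfloat16 bit pattern.
    bf16_bits = (data.view(np.int32) >> 16).astype(np.int16)

    # Round-trip: restore the high half to see what survives exactly.
    restored = (bf16_bits.astype(np.int32) << 16).view(np.float32)
    print(restored)  # [1.0, -2.5, 3.140625]; 1.0 and -2.5 are exact in bf16

Passing `raw_dtype=data_qtype` to `add_tensor` then writes these 16-bit halves out verbatim under the declared BF16 type, instead of letting the writer infer a type from the NumPy dtype (which would misreport the int16 view).
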
@@ -2417,8 +2427,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16"], default="f16",
-        help="output format - use f32 for float32, f16 for float16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -2475,6 +2485,7 @@ def main() -> None:
     ftype_map = {
         "f32": gguf.GGMLQuantizationType.F32,
         "f16": gguf.GGMLQuantizationType.F16,
+        "bf16": gguf.GGMLQuantizationType.BF16,
    }

    if args.outfile is not None:
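
With the map entry wired up, the new format is reachable end to end, while the default stays f16 so existing conversions behave as before. A bfloat16 conversion would then be invoked along these lines (script name and model path are placeholders, not from the patch):

    python convert-hf-to-gguf.py ./my-hf-model --outtype bf16
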