Quantization

class tensorrt_llm.quantization.QuantAlgo(
value,
names=<not given>,
*values,
module=None,
qualname=None,
type=None,
start=1,
boundary=None,
)[source]

Bases: StrEnum
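
A minimal usage sketch. Because QuantAlgo is a StrEnum, each member is also a plain string, so it can be written directly into JSON checkpoint configs or compared against string values. The member name W4A16_AWQ below is an assumption; inspect list(QuantAlgo) in your installed release for the actual set of algorithms.

    from tensorrt_llm.quantization import QuantAlgo

    # Assumed member name; the available algorithms vary across releases.
    algo = QuantAlgo.W4A16_AWQ
    print(str(algo))                      # the underlying string value
    print([a.name for a in QuantAlgo])    # enumerate all supported algorithms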

class tensorrt_llm.quantization.QuantMode(
value,
names=<not given>,
*values,
module=None,
qualname=None,
type=None,
start=1,
boundary=None,
)[source]

Bases: IntFlag
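
A minimal sketch of IntFlag composition. Individual quantization options combine with bitwise OR and can be tested with the in operator. The flag names INT4_WEIGHTS, PER_GROUP, and INT8_KV_CACHE are assumptions; check the enum's members in your installed release.

    from tensorrt_llm.quantization import QuantMode

    # Assumed flag names; QuantMode is an IntFlag, so options compose bitwise.
    mode = QuantMode.INT4_WEIGHTS | QuantMode.PER_GROUP
    print(QuantMode.INT4_WEIGHTS in mode)     # True: this flag is set
    print(QuantMode.INT8_KV_CACHE in mode)    # False: this flag is not set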

tensorrt_llm.quantization.quantize_and_export(
*,
model_dir,
device,
calib_dataset,
dtype,
qformat,
kv_cache_dtype,
calib_size,
batch_size,
calib_max_seq_length,
awq_block_size,
output_dir,
tp_size,
pp_size,
cp_size,
seed,
tokenizer_max_seq_length,
num_medusa_heads=None,
num_medusa_layers=None,
max_draft_len=None,
medusa_hidden_act=None,
medusa_model_dir=None,
quant_medusa_head=None,
auto_quantize_bits=None,
device_map='auto',
quantize_lm_head=False,
)[source]

Load the model from model_dir, call ModelOpt to quantize it, and then export the quantized model as a TRT-LLM checkpoint.
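
A hypothetical invocation sketch. All arguments are keyword-only (note the leading * in the signature). The paths, dataset identifier, and the qformat and kv_cache_dtype strings below are illustrative placeholders; the accepted values depend on the installed release.

    from tensorrt_llm.quantization import quantize_and_export

    # Placeholder paths and option values; adjust for your model and release.
    quantize_and_export(
        model_dir="/models/llama-7b-hf",          # model to load and quantize
        device="cuda",
        calib_dataset="cnn_dailymail",            # assumed dataset identifier
        dtype="float16",
        qformat="int4_awq",                       # assumed quantization format string
        kv_cache_dtype=None,                      # keep the KV cache unquantized
        calib_size=512,
        batch_size=1,
        calib_max_seq_length=2048,
        awq_block_size=128,
        output_dir="/checkpoints/llama-7b-int4-awq",
        tp_size=1,
        pp_size=1,
        cp_size=1,
        seed=1234,
        tokenizer_max_seq_length=2048,
    )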