Skip to content

Commit 780dd78

Browse files
Author: cacaview (committed)
1 parent: 6b20da1 · commit: 780dd78

File tree

11 files changed

+47
-29
lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5121,7 +5121,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
51215121
@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM")
51225122
class KimiLinearModel(TextModel):
51235123
"""Kimi-Linear model with hybrid MLA+KDA architecture"""
5124-
model_arch = gguf.MODEL_ARCH.KIMI
5124+
model_arch = gguf.MODEL_ARCH.KIMI_LINEAR
51255125

51265126
_experts: list[dict[str, Tensor]] | None = None
51275127

gguf-py/gguf/constants.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ class MODEL_ARCH(IntEnum):
445445
MINIMAXM2 = auto()
446446
RND1 = auto()
447447
PANGU_EMBED = auto()
448-
KIMI = auto() # Kimi-Linear (hybrid MLA+KDA)
448+
KIMI_LINEAR = auto() # Kimi-Linear (hybrid MLA+KDA)
449449

450450

451451
class VISION_PROJECTOR_TYPE(IntEnum):
@@ -830,7 +830,7 @@ class MODEL_TENSOR(IntEnum):
830830
MODEL_ARCH.COGVLM: "cogvlm",
831831
MODEL_ARCH.RND1: "rnd1",
832832
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
833-
MODEL_ARCH.KIMI: "kimi",
833+
MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
834834
}
835835

836836
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -3096,7 +3096,7 @@ class MODEL_TENSOR(IntEnum):
30963096
MODEL_TENSOR.FFN_DOWN,
30973097
MODEL_TENSOR.FFN_UP,
30983098
],
3099-
MODEL_ARCH.KIMI: [
3099+
MODEL_ARCH.KIMI_LINEAR: [
31003100
MODEL_TENSOR.TOKEN_EMBD,
31013101
MODEL_TENSOR.OUTPUT_NORM,
31023102
MODEL_TENSOR.OUTPUT,

gguf-py/gguf/tensor_mapping.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,15 +1570,33 @@ class TensorNameMap:
15701570
),
15711571

15721572
# Kimi Linear KDA (using SSM_ prefix for consistency)
1573-
MODEL_TENSOR.SSM_CONV1D_Q: ("model.layers.{bid}.self_attn.q_conv1d",),
1574-
MODEL_TENSOR.SSM_CONV1D_K: ("model.layers.{bid}.self_attn.k_conv1d",),
1575-
MODEL_TENSOR.SSM_CONV1D_V: ("model.layers.{bid}.self_attn.v_conv1d",),
1576-
MODEL_TENSOR.SSM_F_A: ("model.layers.{bid}.self_attn.f_a_proj",),
1577-
MODEL_TENSOR.SSM_F_B: ("model.layers.{bid}.self_attn.f_b_proj",),
1578-
MODEL_TENSOR.SSM_BETA: ("model.layers.{bid}.self_attn.b_proj",),
1579-
MODEL_TENSOR.SSM_A_LOG: ("model.layers.{bid}.self_attn.A_log",),
1580-
MODEL_TENSOR.SSM_G_A: ("model.layers.{bid}.self_attn.g_a_proj",),
1581-
MODEL_TENSOR.SSM_G_B: ("model.layers.{bid}.self_attn.g_b_proj",),
1573+
MODEL_TENSOR.SSM_CONV1D_Q: (
1574+
"model.layers.{bid}.self_attn.q_conv1d",
1575+
),
1576+
MODEL_TENSOR.SSM_CONV1D_K: (
1577+
"model.layers.{bid}.self_attn.k_conv1d",
1578+
),
1579+
MODEL_TENSOR.SSM_CONV1D_V: (
1580+
"model.layers.{bid}.self_attn.v_conv1d",
1581+
),
1582+
MODEL_TENSOR.SSM_F_A: (
1583+
"model.layers.{bid}.self_attn.f_a_proj",
1584+
),
1585+
MODEL_TENSOR.SSM_F_B: (
1586+
"model.layers.{bid}.self_attn.f_b_proj",
1587+
),
1588+
MODEL_TENSOR.SSM_BETA: (
1589+
"model.layers.{bid}.self_attn.b_proj",
1590+
),
1591+
MODEL_TENSOR.SSM_A_LOG: (
1592+
"model.layers.{bid}.self_attn.A_log",
1593+
),
1594+
MODEL_TENSOR.SSM_G_A: (
1595+
"model.layers.{bid}.self_attn.g_a_proj",
1596+
),
1597+
MODEL_TENSOR.SSM_G_B: (
1598+
"model.layers.{bid}.self_attn.g_b_proj",
1599+
),
15821600
MODEL_TENSOR.SSM_DT_B: (
15831601
"model.layers.{bid}.self_attn.dt_bias",
15841602
),

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ add_library(llama
8282
models/internlm2.cpp
8383
models/jais.cpp
8484
models/jamba.cpp
85-
models/kimi.cpp
85+
models/kimi-linear.cpp
8686
models/lfm2.cpp
8787
models/llada-moe.cpp
8888
models/llada.cpp

src/llama-arch.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
111111
{ LLM_ARCH_COGVLM, "cogvlm" },
112112
{ LLM_ARCH_RND1, "rnd1" },
113113
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
114-
{ LLM_ARCH_KIMI, "kimi" },
114+
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
115115
{ LLM_ARCH_UNKNOWN, "(unknown)" },
116116
};
117117

@@ -2494,7 +2494,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
24942494
},
24952495
},
24962496
{
2497-
LLM_ARCH_KIMI,
2497+
LLM_ARCH_KIMI_LINEAR,
24982498
{
24992499
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
25002500
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@@ -2833,7 +2833,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
28332833
case LLM_ARCH_RWKV6QWEN2:
28342834
case LLM_ARCH_RWKV7:
28352835
case LLM_ARCH_ARWKV7:
2836-
case LLM_ARCH_KIMI: // KDA layers use delta attention with recurrent state
2836+
case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state
28372837
return true;
28382838
default:
28392839
return false;
@@ -2852,7 +2852,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
28522852
case LLM_ARCH_QWEN3NEXT:
28532853
// Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache
28542854
// TODO: Enable hybrid when MLA KV caching is implemented
2855-
// case LLM_ARCH_KIMI:
2855+
// case LLM_ARCH_KIMI_LINEAR:
28562856
return true;
28572857
default:
28582858
return false;

src/llama-arch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ enum llm_arch {
115115
LLM_ARCH_COGVLM,
116116
LLM_ARCH_RND1,
117117
LLM_ARCH_PANGU_EMBED,
118-
LLM_ARCH_KIMI,
118+
LLM_ARCH_KIMI_LINEAR,
119119
LLM_ARCH_UNKNOWN,
120120
};
121121

src/llama-context.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1387,7 +1387,7 @@ void llama_context::output_reorder() {
13871387
//
13881388

13891389
uint32_t llama_context::graph_max_nodes() const {
1390-
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI) {
1390+
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) {
13911391
return std::max<uint32_t>(8192u, 32u*model.n_tensors());
13921392
}
13931393
return std::max<uint32_t>(1024u, 8u*model.n_tensors());

src/llama-model.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2247,7 +2247,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
22472247
default: type = LLM_TYPE_UNKNOWN;
22482248
}
22492249
} break;
2250-
case LLM_ARCH_KIMI:
2250+
case LLM_ARCH_KIMI_LINEAR:
22512251
{
22522252
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
22532253
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
@@ -6406,7 +6406,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
64066406
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
64076407
}
64086408
} break;
6409-
case LLM_ARCH_KIMI:
6409+
case LLM_ARCH_KIMI_LINEAR:
64106410
{
64116411
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
64126412

@@ -7712,9 +7712,9 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
77127712
{
77137713
llm = std::make_unique<llm_build_qwen3next>(*this, params);
77147714
} break;
7715-
case LLM_ARCH_KIMI:
7715+
case LLM_ARCH_KIMI_LINEAR:
77167716
{
7717-
llm = std::make_unique<llm_build_kimi>(*this, params);
7717+
llm = std::make_unique<llm_build_kimi_linear>(*this, params);
77187718
} break;
77197719
default:
77207720
GGML_ABORT("fatal error");
@@ -7871,7 +7871,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
78717871
case LLM_ARCH_ARCTIC:
78727872
case LLM_ARCH_DEEPSEEK:
78737873
case LLM_ARCH_DEEPSEEK2:
7874-
case LLM_ARCH_KIMI:
7874+
case LLM_ARCH_KIMI_LINEAR:
78757875
case LLM_ARCH_PLM:
78767876
case LLM_ARCH_CHATGLM:
78777877
case LLM_ARCH_GLM4:

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
726726
// sanity checks for models that have attention layers
727727
// Skip this check for Kimi models which have hybrid KDA+MLA architecture
728728
// (only MLA layers have attn_kv_b weights, KDA layers don't)
729-
if (qs.n_attention_wv != 0 && !is_clip_model && model.arch != LLM_ARCH_KIMI)
729+
if (qs.n_attention_wv != 0 && !is_clip_model && model.arch != LLM_ARCH_KIMI_LINEAR)
730730
{
731731
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
732732
// attention layers have a non-zero number of kv heads

src/models/kimi.cpp renamed to src/models/kimi-linear.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "models.h"
22

3-
llm_build_kimi::llm_build_kimi(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) {
3+
llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params), model(model) {
44
ggml_tensor * cur;
55
ggml_tensor * inpL;
66

0 commit comments

Comments (0)