-
Notifications
You must be signed in to change notification settings. Fork: 13.9k
Feature/kimi linear support #17592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Feature/kimi linear support #17592
Changes from 1 commit
7c0334e
0e04784
446c0e6
6b20da1
1b29643
780dd78
3a7e87f
02d3d8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Implement KDA layer (linear attention with gates and decay)
- Implement MLA layer (multi-head latent attention with KV compression)
- Support MoE FFN with shared experts
- Add TikToken tokenizer support for Kimi models
- Fix vocab loading for large vocabularies
- Model loads and runs inference (27 layers, 603 tensors)
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -2722,58 +2722,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter | |||||
| return [] # skip other tensors | ||||||
|
|
||||||
|
|
||||||
| @ModelBase.register("KimiLinearForCausalLM") | ||||||
| class KimiLinearModel(ModelBase): | ||||||
| model_arch = gguf.MODEL_ARCH.KIMI | ||||||
|
|
||||||
| def set_gguf_parameters(self): | ||||||
| self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) | ||||||
| self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) | ||||||
| self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) | ||||||
| self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) | ||||||
| self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) | ||||||
| self.gguf_writer.add_rope_dimension_count(self.hparams["qk_rope_head_dim"]) | ||||||
| self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) | ||||||
| self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) | ||||||
|
|
||||||
| linear_attn = self.hparams.get("linear_attn_config", {}) | ||||||
| if linear_attn: | ||||||
| self.gguf_writer.add_ssm_conv_kernel(linear_attn.get("short_conv_kernel_size", 4)) | ||||||
| # Add other Kimi params as generic KV if needed or extend GGUFWriter | ||||||
| # For now we rely on conv_kernel being enough for the conv op | ||||||
|
|
||||||
| def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: | ||||||
| if "gate_up_proj" in name: | ||||||
| # shape: (2 * intermediate_size, hidden_size) | ||||||
| # split along dim 0. Assuming [gate; up] | ||||||
| out_dim = data_torch.shape[0] | ||||||
| mid = out_dim // 2 | ||||||
| w1 = data_torch[:mid, :] # gate | ||||||
| w3 = data_torch[mid:, :] # up | ||||||
|
|
||||||
| # Map directly using the split names which should map to FFN_GATE and FFN_UP | ||||||
| # We need to construct the original names that map_tensor_name expects for mapping | ||||||
| # Or we can manual map if we know the logic. | ||||||
| # But modify_tensors usually returns mapped names. | ||||||
|
|
||||||
| # tensor_mapping.py: | ||||||
| # FFN_GATE: "mlp.gate_proj" (standard llama) | ||||||
| # FFN_UP: "mlp.up_proj" | ||||||
|
|
||||||
| # name is something like "model.layers.0.mlp.gate_up_proj.weight" | ||||||
| name_gate = name.replace("gate_up_proj", "gate_proj") | ||||||
| name_up = name.replace("gate_up_proj", "up_proj") | ||||||
|
|
||||||
| return [ | ||||||
| (self.map_tensor_name(name_gate), w1), | ||||||
| (self.map_tensor_name(name_up), w3) | ||||||
| ] | ||||||
|
|
||||||
| # Handle 1x1xHx1 tensors like A_log | ||||||
| if "A_log" in name: | ||||||
| data_torch = data_torch.squeeze() | ||||||
|
|
||||||
| return [(self.map_tensor_name(name), data_torch)] | ||||||
| # KimiLinearModel is defined later in this file (line ~5140) as a TextModel subclass | ||||||
| # This old definition has been removed to avoid conflicts | ||||||
|
|
||||||
|
|
||||||
| @ModelBase.register( | ||||||
|
|
@@ -5162,12 +5112,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter | |||||
| (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), | ||||||
| (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), | ||||||
| ] | ||||||
| else: | ||||||
| return [(self.map_tensor_name(name), data_torch)] | ||||||
|
|
||||||
|
|
||||||
| @ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") | ||||||
| class KimiLinearModel(TextModel): | ||||||
| """Kimi-Linear model with hybrid MLA+KDA architecture""" | ||||||
| model_arch = gguf.MODEL_ARCH.KIMI | ||||||
|
||||||
| model_arch = gguf.MODEL_ARCH.KIMI | |
| model_arch = gguf.MODEL_ARCH.KIMI_LINEAR |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -445,7 +445,7 @@ class MODEL_ARCH(IntEnum): | |||||||||
| MINIMAXM2 = auto() | ||||||||||
| RND1 = auto() | ||||||||||
| PANGU_EMBED = auto() | ||||||||||
| KIMI = auto() | ||||||||||
| KIMI = auto() # Kimi-Linear (hybrid MLA+KDA) | ||||||||||
|
||||||||||
| KIMI = auto() # Kimi-Linear (hybrid MLA+KDA) | |
| KIMI_LINEAR = auto() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| KEY_ATTENTION_Q_LORA_RANK = Keys.Attention.Q_LORA_RANK | |
| KEY_ATTENTION_KV_LORA_RANK = Keys.Attention.KV_LORA_RANK | |
| KEY_ATTENTION_KEY_LENGTH_MLA = Keys.Attention.KEY_LENGTH_MLA | |
| KEY_ATTENTION_VALUE_LENGTH_MLA = Keys.Attention.VALUE_LENGTH_MLA |
These are old aliases.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -82,6 +82,7 @@ add_library(llama | |||||
| models/internlm2.cpp | ||||||
| models/jais.cpp | ||||||
| models/jamba.cpp | ||||||
| models/kimi.cpp | ||||||
|
||||||
| models/kimi.cpp | |
| models/kimi-linear.cpp |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.