Skip to content

Commit ff55414

Browse files
pwilkin, CISC, and ggerganov
authored
model : Qwen3 Next (#16095)
* Qwen3 Next - cleaned up version * Whitespaces and stuff * Correct minor errors * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret <[email protected]> * Misc. fixes. * Clean up code, add missing hybrid qualifier * Did someone transpose the SOLVE_TRI result matrix? Perhaps... * Whitespace * Proper tensors for cb calls * Use llama-graph.h vertical alignment * BROKEN: chunking * Set new tensors as inputs. * Proper chunk logic * It's the circle of life... * More shenanigans for n_seq > 1 * Nail in the coffin? * Fix Windows build * Eh, one fails on Windows, the other fails on Mac... just use general capture. * quant : cleanup * model : cleanup * qwen3 : cleanup * cont : cleanup * cont : cleanup * ggml : revert change * qwen3 : cleanup * cont : cleanup * Readd cmath * qwen3 : fix typo * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <[email protected]> * Usual suspects * fix my bad suggestion --------- Co-authored-by: Sigbjørn Skjæret <[email protected]> Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 73955f7 commit ff55414

File tree

16 files changed

+1345
-19
lines changed

16 files changed

+1345
-19
lines changed

convert_hf_to_gguf.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4183,6 +4183,36 @@ def set_vocab(self):
41834183
super().set_vocab()
41844184

41854185

4186+
@ModelBase.register("Qwen3NextForCausalLM")
4187+
class Qwen3NextModel(Qwen2MoeModel):
4188+
model_arch = gguf.MODEL_ARCH.QWEN3NEXT
4189+
4190+
def set_gguf_parameters(self):
4191+
super().set_gguf_parameters()
4192+
self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
4193+
self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
4194+
self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
4195+
self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
4196+
self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
4197+
if (rope_dim := self.hparams.get("head_dim")) is None:
4198+
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
4199+
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
4200+
4201+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4202+
if name.startswith("mtp"):
4203+
return [] # ignore MTP layers for now
4204+
if name.endswith(".A_log"):
4205+
data_torch = -torch.exp(data_torch)
4206+
elif name.endswith(".dt_bias"):
4207+
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
4208+
elif "conv1d" in name:
4209+
data_torch = data_torch.squeeze()
4210+
elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
4211+
data_torch = data_torch + 1
4212+
4213+
yield from super().modify_tensors(data_torch, name, bid)
4214+
4215+
41864216
@ModelBase.register("RND1")
41874217
class RND1Model(Qwen2MoeModel):
41884218
model_arch = gguf.MODEL_ARCH.RND1

examples/model-conversion/scripts/causal/run-converted-model.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ set -e
44

55
# First try command line argument, then environment variable, then file
66
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
7+
MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
8+
9+
if [ -z "$MODEL_TESTING_PROMPT" ]; then   # NOTE(review): original diff lacks the space before "]", which is a syntax error in `test` — POSIX requires "]" to be a separate argument
10+
MODEL_TESTING_PROMPT="Hello, my name is"
11+
fi
712

813
# Final check if we have a model path
914
if [ -z "$CONVERTED_MODEL" ]; then
@@ -14,7 +19,8 @@ if [ -z "$CONVERTED_MODEL" ]; then
1419
fi
1520

1621
echo $CONVERTED_MODEL
22+
echo $MODEL_TESTING_PROMPT
1723

1824
cmake --build ../../build --target llama-logits -j8
1925

20-
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
26+
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"

examples/model-conversion/scripts/causal/run-org-model.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,12 @@ def fn(_m, input, output):
184184
# of using AutoModelForCausalLM.
185185
print(f"Model class: {model.__class__.__name__}")
186186

187-
prompt = "Hello, my name is"
188-
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
187+
device = next(model.parameters()).device
188+
if os.getenv("MODEL_TESTING_PROMPT"):
189+
prompt = os.getenv("MODEL_TESTING_PROMPT")
190+
else:
191+
prompt = "Hello, my name is"
192+
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
189193

190194
print(f"Input tokens: {input_ids}")
191195
print(f"Input text: {repr(prompt)}")

ggml/src/ggml-cpu/ops.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9766,7 +9766,8 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
97669766
}
97679767

97689768
const float diag = A_batch[i00 * n + i00];
9769-
GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
9769+
assert(diag != 0.0f && "Zero diagonal in triangular matrix");
9770+
97709771
X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
97719772
}
97729773
}

gguf-py/gguf/constants.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ class MODEL_ARCH(IntEnum):
366366
QWEN2VL = auto()
367367
QWEN3 = auto()
368368
QWEN3MOE = auto()
369+
QWEN3NEXT = auto()
369370
QWEN3VL = auto()
370371
QWEN3VLMOE = auto()
371372
PHI2 = auto()
@@ -531,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
531532
SSM_D = auto()
532533
SSM_NORM = auto()
533534
SSM_OUT = auto()
535+
SSM_BETA_ALPHA = auto() # qwen3next
534536
TIME_MIX_W0 = auto()
535537
TIME_MIX_W1 = auto()
536538
TIME_MIX_W2 = auto()
@@ -736,6 +738,7 @@ class MODEL_TENSOR(IntEnum):
736738
MODEL_ARCH.QWEN2VL: "qwen2vl",
737739
MODEL_ARCH.QWEN3: "qwen3",
738740
MODEL_ARCH.QWEN3MOE: "qwen3moe",
741+
MODEL_ARCH.QWEN3NEXT: "qwen3next",
739742
MODEL_ARCH.QWEN3VL: "qwen3vl",
740743
MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe",
741744
MODEL_ARCH.PHI2: "phi2",
@@ -900,6 +903,7 @@ class MODEL_TENSOR(IntEnum):
900903
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
901904
MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm",
902905
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
906+
MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba",
903907
MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
904908
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
905909
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
@@ -1569,6 +1573,35 @@ class MODEL_TENSOR(IntEnum):
15691573
MODEL_TENSOR.FFN_DOWN_EXP,
15701574
MODEL_TENSOR.FFN_UP_EXP,
15711575
],
1576+
MODEL_ARCH.QWEN3NEXT: [
1577+
MODEL_TENSOR.TOKEN_EMBD,
1578+
MODEL_TENSOR.OUTPUT_NORM,
1579+
MODEL_TENSOR.OUTPUT,
1580+
MODEL_TENSOR.ATTN_NORM,
1581+
MODEL_TENSOR.ATTN_Q,
1582+
MODEL_TENSOR.ATTN_Q_NORM,
1583+
MODEL_TENSOR.ATTN_K,
1584+
MODEL_TENSOR.ATTN_K_NORM,
1585+
MODEL_TENSOR.ATTN_V,
1586+
MODEL_TENSOR.ATTN_OUT,
1587+
MODEL_TENSOR.ATTN_POST_NORM,
1588+
MODEL_TENSOR.ATTN_GATE,
1589+
MODEL_TENSOR.FFN_GATE_INP,
1590+
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
1591+
MODEL_TENSOR.FFN_UP_SHEXP,
1592+
MODEL_TENSOR.FFN_DOWN_SHEXP,
1593+
MODEL_TENSOR.FFN_GATE_SHEXP,
1594+
MODEL_TENSOR.FFN_DOWN_EXP,
1595+
MODEL_TENSOR.FFN_UP_EXP,
1596+
MODEL_TENSOR.FFN_GATE_EXP,
1597+
MODEL_TENSOR.SSM_A,
1598+
MODEL_TENSOR.SSM_CONV1D,
1599+
MODEL_TENSOR.SSM_DT,
1600+
MODEL_TENSOR.SSM_NORM,
1601+
MODEL_TENSOR.SSM_IN,
1602+
MODEL_TENSOR.SSM_BETA_ALPHA,
1603+
MODEL_TENSOR.SSM_OUT
1604+
],
15721605
MODEL_ARCH.QWEN3VL: [
15731606
MODEL_TENSOR.TOKEN_EMBD,
15741607
MODEL_TENSOR.OUTPUT_NORM,

gguf-py/gguf/tensor_mapping.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -672,17 +672,19 @@ class TensorNameMap:
672672
),
673673

674674
MODEL_TENSOR.SSM_IN: (
675-
"model.layers.{bid}.in_proj", # mamba-hf
676-
"backbone.layers.{bid}.mixer.in_proj", # mamba
677-
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
678-
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
675+
"model.layers.{bid}.in_proj", # mamba-hf
676+
"backbone.layers.{bid}.mixer.in_proj", # mamba
677+
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
678+
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
679+
"model.layers.{bid}.linear_attn.in_proj_qkvz", # qwen3next
679680
),
680681

681682
MODEL_TENSOR.SSM_CONV1D: (
682683
"model.layers.{bid}.conv1d", # mamba-hf
683684
"backbone.layers.{bid}.mixer.conv1d", # mamba
684685
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
685686
"model.layers.layers.{bid}.mixer.conv1d", # plamo2
687+
"model.layers.{bid}.linear_attn.conv1d", # qwen3next
686688
),
687689

688690
MODEL_TENSOR.SSM_X: (
@@ -697,6 +699,7 @@ class TensorNameMap:
697699
"backbone.layers.{bid}.mixer.dt_proj", # mamba
698700
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
699701
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
702+
"model.layers.{bid}.linear_attn.dt_proj", # qwen3next
700703
),
701704

702705
MODEL_TENSOR.SSM_DT_NORM: (
@@ -709,6 +712,7 @@ class TensorNameMap:
709712
"backbone.layers.{bid}.mixer.A_log", # mamba
710713
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
711714
"model.layers.layers.{bid}.mixer.A_log", # plamo2
715+
"model.layers.{bid}.linear_attn.A_log", # qwen3next
712716
),
713717

714718
MODEL_TENSOR.SSM_B_NORM: (
@@ -731,17 +735,23 @@ class TensorNameMap:
731735
),
732736

733737
MODEL_TENSOR.SSM_NORM: (
734-
"model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
735-
"backbone.layers.{bid}.mixer.norm", # mamba2
738+
"model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
739+
"model.layers.{bid}.linear_attn.norm", # qwen3next
740+
"backbone.layers.{bid}.mixer.norm", # mamba2
736741
),
737742

738743
MODEL_TENSOR.SSM_OUT: (
739744
"model.layers.{bid}.out_proj", # mamba-hf
740745
"backbone.layers.{bid}.mixer.out_proj", # mamba
741746
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
747+
"model.layers.{bid}.linear_attn.out_proj", # qwen3next
742748
"model.layers.layers.{bid}.mixer.out_proj", # plamo2
743749
),
744750

751+
MODEL_TENSOR.SSM_BETA_ALPHA: (
752+
"model.layers.{bid}.linear_attn.in_proj_ba", # qwen3next
753+
),
754+
745755
MODEL_TENSOR.TIME_MIX_W0: (
746756
"model.layers.{bid}.attention.w0", # rwkv7
747757
),

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ add_library(llama
114114
models/qwen3vl.cpp
115115
models/qwen3vl-moe.cpp
116116
models/qwen3moe.cpp
117+
models/qwen3next.cpp
117118
models/refact.cpp
118119
models/rnd1.cpp
119120
models/rwkv6-base.cpp

src/llama-arch.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
3232
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
3333
{ LLM_ARCH_QWEN3, "qwen3" },
3434
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
35+
{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
3536
{ LLM_ARCH_QWEN3VL, "qwen3vl" },
3637
{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
3738
{ LLM_ARCH_PHI2, "phi2" },
@@ -829,6 +830,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
829830
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
830831
},
831832
},
833+
{
834+
LLM_ARCH_QWEN3NEXT,
835+
{
836+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
837+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
838+
{ LLM_TENSOR_OUTPUT, "output" },
839+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
840+
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
841+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
842+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
843+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
844+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
845+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
846+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
847+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
848+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
849+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
850+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
851+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
852+
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
853+
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
854+
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
855+
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
856+
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
857+
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
858+
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
859+
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
860+
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
861+
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
862+
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
863+
},
864+
},
832865
{
833866
LLM_ARCH_QWEN3VL,
834867
{
@@ -2556,6 +2589,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
25562589
{LLM_TENSOR_SSM_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
25572590
{LLM_TENSOR_SSM_DT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
25582591
{LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
2592+
{LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
25592593
{LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
25602594
{LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
25612595
{LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2754,6 +2788,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
27542788
case LLM_ARCH_LFM2:
27552789
case LLM_ARCH_LFM2MOE:
27562790
case LLM_ARCH_NEMOTRON_H:
2791+
case LLM_ARCH_QWEN3NEXT:
27572792
return true;
27582793
default:
27592794
return false;

src/llama-arch.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ enum llm_arch {
3636
LLM_ARCH_QWEN2VL,
3737
LLM_ARCH_QWEN3,
3838
LLM_ARCH_QWEN3MOE,
39+
LLM_ARCH_QWEN3NEXT,
3940
LLM_ARCH_QWEN3VL,
4041
LLM_ARCH_QWEN3VLMOE,
4142
LLM_ARCH_PHI2,
@@ -381,6 +382,7 @@ enum llm_tensor {
381382
LLM_TENSOR_SSM_D,
382383
LLM_TENSOR_SSM_NORM,
383384
LLM_TENSOR_SSM_OUT,
385+
LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
384386
LLM_TENSOR_TIME_MIX_W0,
385387
LLM_TENSOR_TIME_MIX_W1,
386388
LLM_TENSOR_TIME_MIX_W2,

src/llama-context.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "llama-context.h"
22

3+
#include "llama-arch.h"
34
#include "llama-impl.h"
45
#include "llama-batch.h"
56
#include "llama-io.h"
@@ -1386,6 +1387,9 @@ void llama_context::output_reorder() {
13861387
//
13871388

13881389
uint32_t llama_context::graph_max_nodes() const {
1390+
if (model.arch == LLM_ARCH_QWEN3NEXT) {
1391+
return std::max<uint32_t>(8192u, 32u*model.n_tensors());
1392+
}
13891393
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
13901394
}
13911395

0 commit comments

Comments
 (0)