Skip to content

Commit 7ba071a

Browse files
committed
model : add LFM2-ColBert-350M
1 parent 8e3a761 commit 7ba071a

13 files changed

Lines changed: 89 additions & 26 deletions

convert_hf_to_gguf.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9948,6 +9948,31 @@ def _is_audio_tensor(self, name: str):
99489948
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
99499949

99509950

9951+
@ModelBase.register("Lfm2Model")
9952+
class LFM2ColBertModel(LFM2Model):
9953+
model_arch = gguf.MODEL_ARCH.LFM2
9954+
dense_tensor_name = "dense_2"
9955+
9956+
def set_vocab(self):
9957+
super().set_vocab()
9958+
self.gguf_writer.add_add_bos_token(False)
9959+
9960+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9961+
if not name.startswith(self.dense_tensor_name):
9962+
name = "model." + name
9963+
9964+
return super().modify_tensors(data_torch, name, bid)
9965+
9966+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
9967+
# dense tensor is stored in a separate safetensors file
9968+
from safetensors.torch import load_file
9969+
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
9970+
assert tensors_file.is_file()
9971+
tensor = load_file(tensors_file)["linear.weight"]
9972+
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
9973+
yield f"{self.dense_tensor_name}.weight", tensor.clone()
9974+
9975+
99519976
@ModelBase.register("Lfm2MoeForCausalLM")
99529977
class LFM2MoeModel(TextModel):
99539978
model_arch = gguf.MODEL_ARCH.LFM2MOE

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ class LLM:
104104
VOCAB_SIZE = "{arch}.vocab_size"
105105
CONTEXT_LENGTH = "{arch}.context_length"
106106
EMBEDDING_LENGTH = "{arch}.embedding_length"
107+
EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out"
107108
FEATURES_LENGTH = "{arch}.features_length"
108109
BLOCK_COUNT = "{arch}.block_count"
109110
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
@@ -3038,6 +3039,7 @@ class MODEL_TENSOR(IntEnum):
30383039
MODEL_TENSOR.ATTN_V,
30393040
MODEL_TENSOR.ATTN_OUT,
30403041
MODEL_TENSOR.OUTPUT,
3042+
MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M
30413043
],
30423044
MODEL_ARCH.LFM2MOE: [
30433045
MODEL_TENSOR.TOKEN_EMBD,

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -681,6 +681,9 @@ def add_context_length(self, length: int) -> None:
681681
def add_embedding_length(self, length: int) -> None:
682682
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
683683

684+
def add_embedding_length_out(self, length: int) -> None:
685+
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH_OUT.format(arch=self.arch), length)
686+
684687
def add_features_length(self, length: int) -> None:
685688
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
686689

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -535,6 +535,7 @@ extern "C" {
535535
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
536536
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
537537
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
538+
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
538539
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
539540
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
540541
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);

src/llama-arch.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
152152
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
153153
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
154154
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
155+
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
155156
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
156157
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
157158
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
@@ -2075,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
20752076
LLM_TENSOR_TOKEN_EMBD,
20762077
LLM_TENSOR_OUTPUT_NORM_LFM2,
20772078
LLM_TENSOR_OUTPUT,
2079+
LLM_TENSOR_DENSE_2_OUT,
20782080
};
20792081
case LLM_ARCH_LFM2MOE:
20802082
return {

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -156,6 +156,7 @@ enum llm_kv {
156156
LLM_KV_VOCAB_SIZE,
157157
LLM_KV_CONTEXT_LENGTH,
158158
LLM_KV_EMBEDDING_LENGTH,
159+
LLM_KV_EMBEDDING_LENGTH_OUT,
159160
LLM_KV_FEATURES_LENGTH,
160161
LLM_KV_BLOCK_COUNT,
161162
LLM_KV_LEADING_DENSE_BLOCK_COUNT,

src/llama-context.cpp

Lines changed: 21 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -758,7 +758,8 @@ float * llama_context::get_embeddings_ith(int32_t i) {
758758
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
759759
}
760760

761-
return embd + j*model.hparams.n_embd;
761+
const uint32_t n_embd_out = model.hparams.get_n_embd_out();
762+
return embd + j*n_embd_out;
762763
} catch (const std::exception & err) {
763764
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
764765
#ifndef NDEBUG
@@ -1194,9 +1195,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
11941195
{
11951196
// extract token embeddings
11961197
GGML_ASSERT(embd != nullptr);
1198+
const uint32_t n_embd_out = hparams.get_n_embd_out();
11971199

1198-
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
1199-
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
1200+
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
1201+
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
12001202
} break;
12011203
case LLAMA_POOLING_TYPE_MEAN:
12021204
case LLAMA_POOLING_TYPE_CLS:
@@ -1215,17 +1217,17 @@ int llama_context::encode(const llama_batch & batch_inp) {
12151217
} break;
12161218
case LLAMA_POOLING_TYPE_RANK:
12171219
{
1218-
// extract the rerank score - n_cls_out floats per sequence
1220+
// extract the rerank score - n_embd_out floats per sequence
12191221
auto & embd_seq_out = embd_seq;
12201222

1221-
const uint32_t n_cls_out = hparams.n_cls_out;
1223+
const uint32_t n_embd_out = hparams.get_n_embd_out();
12221224

12231225
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
12241226
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
12251227
const int32_t seq_idx = ubatch.seq_idx[seq_id];
12261228

1227-
embd_seq_out[seq_id].resize(n_cls_out);
1228-
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_idx)*sizeof(float), n_cls_out*sizeof(float));
1229+
embd_seq_out[seq_id].resize(n_embd_out);
1230+
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
12291231
}
12301232
} break;
12311233
case LLAMA_POOLING_TYPE_UNSPECIFIED:
@@ -1600,12 +1602,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
16001602
{
16011603
// extract token embeddings
16021604
GGML_ASSERT(embd != nullptr);
1603-
float * embd_out = embd + n_outputs_prev*n_embd;
1605+
const uint32_t n_embd_out = hparams.get_n_embd_out();
1606+
float * embd_out = embd + n_outputs_prev*n_embd_out;
16041607

16051608
if (n_outputs) {
16061609
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
1607-
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
1608-
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
1610+
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
1611+
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
16091612
}
16101613
} break;
16111614
case LLAMA_POOLING_TYPE_MEAN:
@@ -1625,17 +1628,17 @@ int llama_context::decode(const llama_batch & batch_inp) {
16251628
} break;
16261629
case LLAMA_POOLING_TYPE_RANK:
16271630
{
1628-
// extract the rerank score - n_cls_out floats per sequence
1631+
// extract the rerank score - n_embd_out floats per sequence
16291632
auto & embd_seq_out = embd_seq;
16301633

1631-
const uint32_t n_cls_out = hparams.n_cls_out;
1634+
const uint32_t n_embd_out = hparams.get_n_embd_out();
16321635

16331636
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
16341637
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
16351638
const int32_t seq_idx = ubatch.seq_idx[seq_id];
16361639

1637-
embd_seq_out[seq_id].resize(n_cls_out);
1638-
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_cls_out*seq_idx)*sizeof(float), n_cls_out*sizeof(float));
1640+
embd_seq_out[seq_id].resize(n_embd_out);
1641+
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
16391642
}
16401643
} break;
16411644
case LLAMA_POOLING_TYPE_UNSPECIFIED:
@@ -1730,9 +1733,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
17301733

17311734
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
17321735

1733-
const auto n_batch = cparams.n_batch;
1734-
const auto n_vocab = vocab.n_tokens();
1735-
const auto n_embd = hparams.n_embd;
1736+
const auto n_batch = cparams.n_batch;
1737+
const auto n_vocab = vocab.n_tokens();
1738+
const auto n_embd_out = hparams.get_n_embd_out();
17361739

17371740
bool has_logits = true;
17381741
bool has_embd = cparams.embeddings;
@@ -1773,7 +1776,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
17731776

17741777
// Allocate CPU logits buffer only if needed by sequences in this batch
17751778
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
1776-
embd_size = has_embd ? n_embd*n_outputs_max : 0;
1779+
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
17771780

17781781
// TODO: avoid this branching by working with the worst-case
17791782
if (!has_sampling) {

src/llama-graph.cpp

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2067,14 +2067,18 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
20672067
void llm_graph_context::build_dense_out(
20682068
ggml_tensor * dense_2,
20692069
ggml_tensor * dense_3) const {
2070-
if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
2070+
if (!cparams.embeddings || !(dense_2 || dense_3)) {
20712071
return;
20722072
}
20732073
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
20742074
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
20752075

2076-
cur = ggml_mul_mat(ctx0, dense_2, cur);
2077-
cur = ggml_mul_mat(ctx0, dense_3, cur);
2076+
if (dense_2) {
2077+
cur = ggml_mul_mat(ctx0, dense_2, cur);
2078+
}
2079+
if (dense_3) {
2080+
cur = ggml_mul_mat(ctx0, dense_3, cur);
2081+
}
20782082
cb(cur, "result_embd_pooled", -1);
20792083
res->t_embd_pooled = cur;
20802084
ggml_build_forward_expand(gf, cur);

src/llama-hparams.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,10 @@ uint32_t llama_hparams::n_embd_inp() const {
7272
return n_embd_inp;
7373
}
7474

75+
uint32_t llama_hparams::get_n_embd_out() const {
76+
return n_embd_out > 0 ? n_embd_out : n_embd;
77+
}
78+
7579
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
7680
const uint32_t n_head_kv = this->n_head_kv(il);
7781

src/llama-hparams.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -162,6 +162,9 @@ struct llama_hparams {
162162
// for Classifiers
163163
uint32_t n_cls_out = 1;
164164

165+
// output embedding dimension (0 = use n_embd)
166+
uint32_t n_embd_out = 0;
167+
165168
// llama4 smallthinker
166169
uint32_t n_moe_layer_step = 0;
167170
uint32_t n_no_rope_layer_step = 4;
@@ -234,6 +237,9 @@ struct llama_hparams {
234237
// dimension of main + auxiliary input embeddings
235238
uint32_t n_embd_inp() const;
236239

240+
// dimension of output embeddings
241+
uint32_t get_n_embd_out() const;
242+
237243
// dimension of key embeddings across all k-v heads
238244
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
239245

0 commit comments

Comments
 (0)