Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
sentencepiece tokenizer for llama.cpp
teuken chat template
  • Loading branch information
net-haus committed Nov 18, 2025
commit 5fd8e3c8415648a2a78428289d0339d87ec43adf
1 change: 1 addition & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ extern "C" {
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
LLAMA_VOCAB_TYPE_SPIE = 7, // TEUKEN tokenizer based on SentencePiece
};

enum llama_rope_type {
Expand Down
7 changes: 5 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,19 @@ add_library(llama
llama-quant.cpp
llama-sampling.cpp
llama-vocab.cpp
llama-vocab-sentencepiece.cpp
unicode-data.cpp
unicode.cpp
unicode.h
)

target_include_directories(llama PRIVATE .)
target_include_directories(llama PUBLIC ../include)
target_include_directories(llama PUBLIC ../include "C:/NHKI/llama/llama.cpp/common" "C:/NHKI/llama/sentencepiece/src" "C:/NHKI/llama/sentencepiece")
#target_link_directories(${TEST_TARGET} PRIVATE "C:/NHKI/llama/sentencepiece/build/src/Debug")

target_compile_features (llama PRIVATE cxx_std_17) # don't bump

target_link_libraries(llama PUBLIC ggml)
target_link_libraries(llama PUBLIC ggml "C:/NHKI/llama/sentencepiece/build/src/Debug/sentencepiece.lib")

if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
Expand Down
1 change: 1 addition & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_SENTENCEPIECE_MODEL, "tokenizer.ggml.sentencepiece_model" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
Expand Down
1 change: 1 addition & 0 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_ADD_PREFIX,
LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_SENTENCEPIECE_MODEL,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
Expand Down
13 changes: 11 additions & 2 deletions src/llama-chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,27 +457,36 @@ int32_t llm_chat_apply_template(
}
} else if (tmpl == LLM_CHAT_TEMPLATE_TEUKEN) {
            // openGPT-X Teuken instruct template (System:/User:/Assistant: turns)
bool isSysOut=false;
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
const std::string lang=trim( message->content);
if(LLM_TEUKEN_SYSTEM.find(lang)==LLM_TEUKEN_SYSTEM.end())
{
ss << "System: " << message->content << "\n";
std::string teuken_system=(*(LLM_TEUKEN_SYSTEM.find("EN"))).second;
ss << "System: " << teuken_system << "\n";
}
else
{
std::string teuken_system=(*(LLM_TEUKEN_SYSTEM.find(lang))).second;
ss << "System: " << teuken_system << "\n";
}
isSysOut=true;
} else if (role == "user") {
if (!isSysOut)
{
std::string teuken_system=(*(LLM_TEUKEN_SYSTEM.find("EN"))).second;
ss << "System: " << teuken_system << "\n";
isSysOut=true;
}
ss << "User: " << message->content << "\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << "</s>\n";
}
}
if (add_ass) {
ss << "Assistant:";
ss << "Assistant: ";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
// deepseek-ai/deepseek-coder-33b-instruct
Expand Down
38 changes: 38 additions & 0 deletions src/llama-vocab-sentencepiece.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include "sentencepiece_processor.h"
#include "filesystem.h"
#include "llama-vocab-sentencepiece.h"
#include <string>
#include <vector>
#include <memory>
#include <stdexcept>



std::unique_ptr<sentencepiece::SentencePieceProcessor> processor;

int sp_init(const std::string& sp_binary)
{
if (processor.get()==NULL)
{
processor.reset(new sentencepiece::SentencePieceProcessor);
const auto status = processor->LoadFromSerializedProto(sp_binary);
if (!status.ok()) {
//std::cerr << status.ToString() << std::endl;
// error
throw std::invalid_argument("sentencepiece not initialized");
return 0;
}
}
return 1;
}

// Encode `str` into SentencePiece token ids, storing them in `token_ids`.
// NOTE(review): SentencePieceProcessor::Encode is believed to REPLACE the
// contents of the output vector rather than append — confirm, since the
// caller in llama-vocab.cpp pushes a BOS token into `output` before calling.
// Returns 0 on success.
// Throws std::invalid_argument if sp_init() has not successfully run.
int sp_encode(const std::string& str, std::vector<int32_t>& token_ids)
{
    if (!processor)
    {
        throw std::invalid_argument("sentencepiece not initialized");
    }

    processor->Encode(str, &token_ids);
    return 0;
}
9 changes: 9 additions & 0 deletions src/llama-vocab-sentencepiece.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <string>
#include <vector>
#include <memory>

// Minimal wrapper around a process-global
// sentencepiece::SentencePieceProcessor (see llama-vocab-sentencepiece.cpp).

// Load a serialized SentencePiece model proto (`model` holds raw .model
// bytes). Lazily creates the global processor on first call; later calls are
// no-ops. Returns 1 on success; throws std::invalid_argument on load failure.
int sp_init(const std::string& model);

// Encode `str` into SentencePiece token ids, stored in `token_ids`.
// Returns 0 on success; throws std::invalid_argument if sp_init() has not
// successfully run.
int sp_encode(const std::string& str,std::vector<int32_t>& token_ids);
119 changes: 115 additions & 4 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "gguf.h"
#include "llama-impl.h"
#include "llama-model-loader.h"

#include "llama-vocab-sentencepiece.h"
#include "unicode.h"

#include <algorithm>
Expand Down Expand Up @@ -1103,6 +1103,44 @@ struct llm_tokenizer_ugm_session {
const llm_tokenizer_ugm & tokenizer;
};



//
// Spie tokenizer
//

// Marker tokenizer type for LLAMA_VOCAB_TYPE_SPIE. Holds no state of its
// own: the actual SentencePiece model lives in the global processor set up
// by sp_init() during vocab loading.
struct llm_tokenizer_spie : llm_tokenizer {
    // Parameter intentionally unnamed: the vocab is not consulted here
    // (silences the unused-parameter warning the original produced).
    llm_tokenizer_spie(const llama_vocab & /*vocab*/) {
        // Intentionally empty — sp_init() already ran when the GGUF
        // tokenizer metadata was read.
    }
};

// Per-tokenize-call session for the SPIE tokenizer, mirroring the session
// pattern used by the other tokenizers in this file.
struct llm_tokenizer_spie_session {
    llm_tokenizer_spie_session(const llama_vocab & vocab, const llm_tokenizer_spie & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    /* Delegates tokenization entirely to the embedded SentencePiece library
     * via sp_encode(); unlike the UGM tokenizer above, no Viterbi search is
     * implemented here — SentencePiece performs its own segmentation
     * internally.
     * NOTE(review): sp_encode() forwards to SentencePieceProcessor::Encode,
     * which may overwrite `output` rather than append — confirm that tokens
     * pushed into `output` by the caller (e.g. BOS) survive this call.
     */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        sp_encode(text,output);
    }

    // Kept for interface symmetry with the other *_session structs; neither
    // member is currently read by tokenize().
    const llama_vocab & vocab;
    const llm_tokenizer_spie & tokenizer;
};



//
// RWKV tokenizer
//
Expand Down Expand Up @@ -1592,6 +1630,8 @@ struct llama_vocab::impl {

std::vector<char> precompiled_charsmap;

std::string sentencepiece_model;

impl(const llama_vocab & vocab) : vocab(vocab) {
}

Expand Down Expand Up @@ -1708,15 +1748,40 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "teuken") {
type = LLAMA_VOCAB_TYPE_UGM;
} else if (tokenizer_model == "sentencepiece") {
type = LLAMA_VOCAB_TYPE_SPIE;
// default special tokens
special_bos_id = 1;
special_eos_id = 2;
special_unk_id = 0;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;

const int sentencepiece_model_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SENTENCEPIECE_MODEL).c_str());
if (sentencepiece_model_keyidx != -1) {
const gguf_type pc_type = gguf_get_arr_type(ctx, sentencepiece_model_keyidx);
GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);

const size_t n_sentencepiece_model = gguf_get_arr_n(ctx, sentencepiece_model_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, sentencepiece_model_keyidx);
sentencepiece_model.assign(pc, pc + n_sentencepiece_model);
sp_init(sentencepiece_model);



#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
for (size_t i = 0; i < xcda_array_size; ++i) {
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}
#endif
}
} else if (tokenizer_model == "bert") {
type = LLAMA_VOCAB_TYPE_WPM;

Expand Down Expand Up @@ -1782,6 +1847,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);

#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
Expand Down Expand Up @@ -2021,7 +2087,12 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_bos = false;
add_eos = true;
} else if (type == LLAMA_VOCAB_TYPE_RWKV) {
} else if (type == LLAMA_VOCAB_TYPE_SPIE) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_bos = false;
add_eos = true;
}
else if (type == LLAMA_VOCAB_TYPE_RWKV) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
add_space_prefix = false;
clean_spaces = false;
Expand Down Expand Up @@ -2532,6 +2603,7 @@ std::string llama_vocab::impl::type_name() const{
case LLAMA_VOCAB_TYPE_UGM: return "UGM";
case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
case LLAMA_VOCAB_TYPE_SPIE: return "SPIE";
default: return "unknown";
}
}
Expand Down Expand Up @@ -2576,6 +2648,7 @@ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
const auto & token_data = id_to_token.at(id);
switch (get_type()) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_SPIE:
case LLAMA_VOCAB_TYPE_UGM: {
auto buf = token_data.text.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
Expand Down Expand Up @@ -2612,6 +2685,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
case LLAMA_VOCAB_TYPE_UGM:
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
break;
case LLAMA_VOCAB_TYPE_SPIE:
tokenizer = std::make_unique<llm_tokenizer_spie>(vocab);
break;
case LLAMA_VOCAB_TYPE_RWKV:
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
break;
Expand Down Expand Up @@ -2938,6 +3014,39 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
"Are you sure this is what you want?\n", __FUNCTION__);
}

if (add_special && add_eos) {
GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(special_eos_id);
}
} break;
case LLAMA_VOCAB_TYPE_SPIE:
{
if (add_special && add_bos) {
GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(special_bos_id);
}

llm_tokenizer_spie_session session(vocab, *static_cast<const llm_tokenizer_spie *>(tokenizer.get()));

for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
}
}

if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}

if (add_special && add_eos) {
GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
output.push_back(special_eos_id);
Expand Down Expand Up @@ -3025,6 +3134,7 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
switch (get_type()) {
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_SPIE:
case LLAMA_VOCAB_TYPE_UGM: {
// NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens.
Expand Down Expand Up @@ -3313,6 +3423,7 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
static const char * hex = "0123456789ABCDEF";
switch (get_type()) {
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_SPIE:
case LLAMA_VOCAB_TYPE_UGM: {
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
auto token = pimpl->token_to_id.find(buf);
Expand Down
1 change: 1 addition & 0 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "unicode.h"
#include "unicode-data.h"
#include "llama-vocab-sentencepiece.h"

#include <algorithm>
#include <cassert>
Expand Down
Loading