Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
8d2eca3
convert_hf_to_gguf: Add support for RWKV v6
MollySophia Jul 31, 2024
dc0767f
Add RWKV tokenization
LaylBongers Apr 4, 2024
865167d
Fix build
MollySophia Jul 31, 2024
7cac72a
Do not use special tokens when matching in RWKV tokenizer
LaylBongers Apr 12, 2024
e92c74f
Fix model loading
LaylBongers Apr 15, 2024
a0aae8d
Add (broken) placeholder graph builder for RWKV
LaylBongers Apr 17, 2024
a866789
Add workaround for kv cache
LaylBongers Apr 19, 2024
4e23d97
Add logits conversion to rwkv5
LaylBongers Apr 23, 2024
5479588
Add rwkv5 layer norms
LaylBongers Apr 26, 2024
dd3aa3d
Add time mix KVRG & correct merge mistake
LaylBongers May 6, 2024
b409fd8
Add remaining time mix parameters
LaylBongers May 13, 2024
3cbeffc
Add time mix output loading
LaylBongers May 13, 2024
b3b17e0
Add placeholder llm_build_time_mix
LaylBongers May 14, 2024
700dad1
Fix build
MollySophia Aug 1, 2024
a180b63
Load more tensors for rwkv v6
MollySophia Aug 1, 2024
0e5ac34
Fix rwkv tokenizer
MollySophia Aug 2, 2024
5732de8
ggml: Add unary operator Exp
MollySophia Aug 2, 2024
0784a0c
RWKV v6 graph building
MollySophia Aug 2, 2024
8d498c7
Add ``rescale_every_n_layers`` parameter
MollySophia Aug 6, 2024
903089b
Add ``wkv.head_size`` key for RWKV
MollySophia Aug 7, 2024
98ce5f4
Fix offloading layers to CUDA
MollySophia Aug 7, 2024
01dcf4b
Fix parallel inferencing for RWKV
MollySophia Aug 9, 2024
6ae2f48
Remove trailing whitespaces
MollySophia Aug 11, 2024
8bc1f9a
build_rwkv: Avoid using inplace operations
MollySophia Aug 11, 2024
18decea
convert_hf_to_gguf: rwkv: Avoid using ``eval``
MollySophia Aug 11, 2024
7f2e370
convert_hf_to_gguf: rwkv tokenizer: Don't escape sequences manually
MollySophia Aug 12, 2024
c695552
Update convert_hf_to_gguf.py
MollySophia Aug 12, 2024
8aa711a
ggml: Add backward computation for unary op ``exp``
MollySophia Aug 12, 2024
ae9936a
Update convert_hf_to_gguf.py
MollySophia Aug 12, 2024
5afa3ef
Update convert_hf_to_gguf.py
MollySophia Aug 12, 2024
12fbe1a
Use MODEL_ARCH.RWKV6 instead of MODEL_ARCH.RWKV
MollySophia Aug 12, 2024
276d53b
build_rwkv6: Simplify graph
MollySophia Aug 12, 2024
b0f4fe5
llama: rwkv6: Detect model.type
MollySophia Aug 13, 2024
683d70c
llama: rwkv6: Fix tensor loading for 7B/14B models
MollySophia Aug 13, 2024
ee1b78c
llama: rwkv6: Fix group_norm assertion failure with Metal
MollySophia Aug 13, 2024
c165e34
llama: rwkv6: Clean up
MollySophia Aug 13, 2024
6da6aa4
llama: rwkv6: Add quantization tensor exclusion
MollySophia Aug 13, 2024
f5d955d
llama: rwkv6: Use the new advanced batch splits
MollySophia Aug 23, 2024
57decb4
Update src/llama.cpp
MollySophia Aug 25, 2024
e94778a
llama: rwkv6: Use ``ggml_norm`` instead of ``ggml_group_norm``
MollySophia Aug 25, 2024
7756afd
llama: rwkv6: Apply code style and misc changes
MollySophia Aug 25, 2024
87a2901
converter: Use class name ``Rwkv6Model``
MollySophia Aug 25, 2024
c414a24
llama: rwkv6: Make use of key ``feed_forward_length``
MollySophia Aug 25, 2024
6d69fd7
llama: rwkv6: Add kv ``time_mix_extra_dim`` and ``time_decay_extra_dim``
MollySophia Aug 25, 2024
601b592
converter: Match ``new_name`` instead of ``name`` for float32 explicit conversion
MollySophia Aug 26, 2024
e0ea511
llama: rwkv6: Keep ``time_mix_w1/w2`` as F32
MollySophia Aug 26, 2024
5f00c52
llama: rwkv6: Remove unused nodes
MollySophia Aug 26, 2024
7444046
llama: rwkv6: Apply code format changes
MollySophia Aug 26, 2024
7f2ef56
llama: rwkv6: Add lora for some supported tensors
MollySophia Aug 30, 2024
7004323
rwkv : speed-up tokenization using trie
ggerganov Aug 30, 2024
59dc2e7
minor : style + indentation
ggerganov Aug 30, 2024
5175375
llama: rwkv6: Avoid division by zero
MollySophia Aug 31, 2024
846358d
ggml: rwkv_wkv: Avoid copying the state
MollySophia Aug 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
rwkv : speed-up tokenization using trie
  • Loading branch information
ggerganov committed Aug 30, 2024
commit 7004323ecdd5f4dab77e626ea0e677fcf175542e
64 changes: 33 additions & 31 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,17 @@ struct naive_trie {
auto res = children.find(c);
if (res != children.end()) {
return res->second.get_longest_prefix(key, len, offset + 1);
} else {
return std::make_pair(key, offset);
}

return std::make_pair(key, offset);
}
struct naive_trie * traverse(const char c) {
const struct naive_trie * traverse(const char c) const {
auto res = children.find(c);
if (res != children.end()) {
return &res->second;
} else {
return NULL;
}

return NULL;
}
std::map<char, struct naive_trie> children;
bool has_value;
Expand Down Expand Up @@ -843,7 +843,7 @@ struct llm_tokenizer_ugm {
// traverse the token matcher trie to find a matching token
bool single_codepoint_token_found = false;
const struct best_tokenization & current_best = tokenization_results[input_offset];
struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);
const struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);

while (prefix_offset <= input_len && node != NULL) {
// check if we found valid token in prefix
Expand Down Expand Up @@ -1103,6 +1103,7 @@ struct llm_tokenizer_ugm {

static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
std::vector<uint8_t> output;
output.reserve(escaped.size());

// Parser state
bool escaping = false;
Expand Down Expand Up @@ -1158,46 +1159,47 @@ struct llm_tokenizer_rwkv {
llm_tokenizer_rwkv(const llama_vocab & vocab): vocab(vocab) {
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
Comment thread
MollySophia marked this conversation as resolved.
Outdated
// For now, we decode the vocab here into the lookup we'll use for tokenization.
for (const auto & token : vocab.id_to_token) {
auto data = llama_unescape_rwkv_token(token.text);
tokens.push_back(data);

// build trie
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
const auto & token = vocab.id_to_token[id];
const auto data = llama_unescape_rwkv_token(token.text);
token_matcher.insert((const char *) data.data(), data.size(), id);
}
}

// Greedy longest-match tokenization over the byte trie built in the
// constructor. Appends token ids to `output`. Bytes that match no token
// emit the unknown token and advance by one, so progress is always made.
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
    uint32_t position = 0;

    while (position < text.size()) {
        const uint32_t start = position;

        const struct naive_trie * node = token_matcher.traverse(text[position]);
        if (node == NULL) {
            // no matching token found, add unknown token
            output.push_back(vocab.special_unk_id);
            position += 1;
            continue;
        }

        // traverse the trie to find the longest matching token
        uint32_t token_id = 0;
        uint32_t token_length = 0;
        while (node != NULL) {
            if (node->has_value) {
                token_id = node->value;
                token_length = position + 1;
            }
            // stop at end of input: indexing past text.size() is undefined
            if (++position >= text.size()) {
                break;
            }
            node = node->traverse(text[position]);
        }

        if (token_length == 0) {
            // the path matched a prefix but no node carried a token id;
            // emit unknown and advance one byte to avoid an infinite loop
            output.push_back(vocab.special_unk_id);
            position = start + 1;
            continue;
        }

        // add the longest matching token
        output.push_back(token_id);
        position = token_length;
    }
}

const llama_vocab & vocab;

std::vector<std::vector<uint8_t>> tokens;
struct naive_trie token_matcher;
};

//
Expand Down