 //
 
 // The default parameters
-struct gpt_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 128; // new tokens to predict
-    int32_t repeat_last_n = 64; // last n tokens to penalize
-    int32_t n_ctx = 2048; // context size
-
+struct gpt_params
+{
+    int32_t seed = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
+    int32_t n_predict = 128; // new tokens to predict
+    int32_t repeat_last_n = 64; // last n tokens to penalize
+    int32_t n_ctx = 2048; // context size
+
     // sampling parameters
     int32_t top_k = 40;
-    float   top_p = 0.95f;
-    float   temp  = 0.10f;
-    float   repeat_penalty  = 1.30f;
+    float top_p = 0.95f;
+    float temp = 0.10f;
+    float repeat_penalty = 1.30f;
 
     int32_t n_batch = 8; // batch size for prompt processing
 
-    std::string model = "ggml-alpaca-7b-q4.bin"; // model path
+    std::string model = "gpt4all-lora-quantized.bin"; // model path
     std::string prompt;
 
     bool use_color = true; // use color to distinguish generations and inputs
 
-    bool interactive = true;  // interactive mode
+    bool interactive = true; // interactive mode
     bool interactive_start = true; // reverse prompt immediately
-    std::string antiprompt = "";  // string upon seeing which more user input is prompted
+    std::string antiprompt = ""; // string upon seeing which more user input is prompted
 };
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse(int argc, char **argv, gpt_params &params);
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+void gpt_print_usage(int argc, char **argv, const gpt_params &params);
 
-std::string gpt_random_prompt(std::mt19937 & rng);
+std::string gpt_random_prompt(std::mt19937 &rng);
 
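Taken together, these declarations suggest the usual driver pattern. Below is a minimal sketch of a caller — a hypothetical main(), not the repository's actual chat.cpp; the seed-from-time fallback and the usage call on parse failure are assumptions:

// Hypothetical caller; assumes this header is included.
#include <ctime>
#include <random>

int main(int argc, char **argv) {
    gpt_params params; // defaults from the struct above

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_print_usage(argc, argv, params);
        return 1;
    }

    if (params.seed < 0) {
        params.seed = (int32_t) time(nullptr); // -1 means "pick a fresh seed"
    }

    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {
        params.prompt = gpt_random_prompt(rng);
    }

    // ... load params.model, tokenize params.prompt, generate ...
    return 0;
}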
 //
 // Vocab utils
 //
 
-struct gpt_vocab {
-    using id    = int32_t;
+struct gpt_vocab
+{
+    using id = int32_t;
     using token = std::string;
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
 };
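The two maps are mirror images of each other. An illustrative helper for deriving the reverse map once token_to_id is filled (e.g. from encoder.json via json_parse below) — not necessarily how gpt_vocab_init does it:

// Illustration only: derive id_to_token from token_to_id.
static void build_reverse_map(gpt_vocab &vocab) {
    for (const auto &entry : vocab.token_to_id) {
        vocab.id_to_token[entry.second] = entry.first;
    }
}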
 
-void replace(std::string & str, const std::string & needle, const std::string & replacement);
+void replace(std::string &str, const std::string &needle, const std::string &replacement);
 
 // poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
+std::map<std::string, int32_t> json_parse(const std::string &fname);
 
 // split text into tokens
 //
@@ -71,36 +73,36 @@ std::map<std::string, int32_t> json_parse(const std::string & fname);
 // Regex (C++):
 // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
 //
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab &vocab, const std::string &text);
 
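The quoted pattern is usable directly with std::regex's ECMAScript grammar (the negative lookahead `(?!\S)` is supported). A sketch of the word-splitting stage only, assuming gpt_tokenize then maps each piece to vocab ids on top of this; the helper name is hypothetical:

#include <regex>
#include <string>
#include <vector>

// First stage of GPT-2-style tokenization: split text into word-level pieces.
static std::vector<std::string> split_words(const std::string &text) {
    static const std::regex re(
        R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");
    std::vector<std::string> words;
    for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
        words.push_back(it->str());
    }
    return words;
}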
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab &vocab, const std::string &text, bool bos);
 
 // load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+bool gpt_vocab_init(const std::string &fname, gpt_vocab &vocab);
 
 // sample next token given probabilities for each embedding
 //
 //   - consider only the top K tokens
 //   - from them, consider only the top tokens with cumulative probability > P
 //
 gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
-        const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
-        double repeat_penalty,
-        int top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng);
+    const gpt_vocab &vocab,
+    const float *logits,
+    std::vector<gpt_vocab::id> &last_n_tokens,
+    double repeat_penalty,
+    int top_k,
+    double top_p,
+    double temp,
+    std::mt19937 &rng);
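The comment above describes the two filters. A hedged sketch of just that filtering step, operating on already-softmaxed probabilities (the declared function additionally applies temp to the logits and repeat_penalty to tokens in last_n_tokens before sampling; this helper is hypothetical, not the library's implementation):

#include <algorithm>
#include <utility>
#include <vector>

// Keep the top K entries, then the smallest prefix whose cumulative
// probability exceeds P, and renormalize the survivors.
static void top_k_top_p_filter(std::vector<std::pair<double, gpt_vocab::id>> &probs,
                               int top_k, double top_p) {
    // top-k: sort descending by probability and drop the tail
    std::sort(probs.begin(), probs.end(),
              [](const std::pair<double, gpt_vocab::id> &a,
                 const std::pair<double, gpt_vocab::id> &b) { return a.first > b.first; });
    if ((int) probs.size() > top_k) {
        probs.resize(top_k);
    }

    // top-p: keep the shortest prefix with cumulative probability > P
    double cum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cum += probs[i].first;
        if (cum > top_p) {
            keep = i + 1;
            break;
        }
    }
    probs.resize(keep);

    // renormalize so the kept probabilities sum to 1 before sampling
    if (cum > 0.0) {
        for (auto &p : probs) {
            p.first /= cum;
        }
    }
}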
 
 // filter to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
+void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> &logits_id, int top_k);
 
 //
 // Quantization
 //
 
-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(float *src, void *dst, int n, int k, int qk, int64_t *hist);
+size_t ggml_quantize_q4_1(float *src, void *dst, int n, int k, int qk, int64_t *hist);
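Both functions return the number of bytes written and record counts of the quantized levels in hist. A much-simplified sketch of the idea behind Q4_0 — one shared scale per block of qk floats, each value mapped to a small signed integer; the real functions also pack two 4-bit values per byte, and this helper is hypothetical:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Simplified Q4_0-style block quantization: dequantize as q[i] * scale.
static void quantize_block_q4(const float *x, int qk, float &scale, int8_t *q) {
    float amax = 0.0f; // largest magnitude in the block
    for (int i = 0; i < qk; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }

    scale = amax / 7.0f; // map [-amax, amax] onto the integer range [-7, 7]
    const float id = scale ? 1.0f / scale : 0.0f;

    for (int i = 0; i < qk; ++i) {
        int v = (int) std::round(x[i] * id); // nearest 4-bit level
        q[i] = (int8_t) std::max(-7, std::min(7, v));
    }
}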