Merge remote-tracking branch 'origin/master' into perplexity

ggml-org · ggerganov · Mar 21, 2023 · Mar 18, 2023 · Mar 19, 2023 · Mar 19, 2023
commit 9d1cdb893807bc57c4a053a788fe167e0c8f0e59
diff --git a/main.cpp b/main.cpp
@@ -558,9 +558,9 @@ bool llama_eval(
         const llama_model & model,
         const int n_threads,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token,
+        const std::vector<llama_vocab::id> & embd_inp,
+              std::vector<float>           & embd_w,
+              size_t                       & mem_per_token,
               bool return_all_logits = false) {
     const int N = embd_inp.size();
 
@@ -800,11 +800,11 @@ std::vector<double> softmax(const std::vector<float>& logits) {
     return probs;
 }
 
-void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
+void perplexity(const llama_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
-    std::vector<gpt_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<llama_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
 
     int count = 0;
     double nll = 0.0;
@@ -813,7 +813,7 @@ void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_para
     for (int i = 0; i < seq_count; ++i) {
         int start = i * params.n_ctx;
         int end = start + params.n_ctx - 1;
-        std::vector<gpt_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
+        std::vector<llama_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
         std::vector<float> logits;
         auto start_t = std::chrono::high_resolution_clock::now();
         if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token, true)) {
@@ -977,13 +977,6 @@ int main(int argc, char ** argv) {
         params.antiprompt.push_back("### Instruction:\n\n");
     }
 
-    // tokenize the reverse prompt
-    std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
-
-    for (auto antiprompt : params.antiprompt) {
-        antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
-    }
-
     // enable interactive mode if reverse prompt is specified
     if (params.antiprompt.size() != 0) {
         params.interactive = true;

diff --git a/utils.h b/utils.h
@@ -32,9 +32,15 @@ struct gpt_params {
     std::string prompt = "";
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    bool instruct    = false; // instruction mode (used for Alpaca models)
-    bool ignore_eos = false; // do not stop generating after eos
-    bool perplexity = false; // compute perplexity over the prompt
+
+    bool memory_f16        = false; // use f16 instead of f32 for memory kv
+    bool random_prompt     = false; // randomize prompt if none provided
+    bool use_color         = false; // use color to distinguish generations and inputs
+    bool interactive       = false; // interactive mode
+    bool interactive_start = false; // reverse prompt immediately
+    bool instruct          = false; // instruction mode (used for Alpaca models)
+    bool ignore_eos        = false; // do not stop generating after eos
+    bool perplexity        = false; // compute perplexity over the prompt
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);