3 files changed, +10 −5 lines changed
examples/server/server.cpp

@@ -1738,7 +1738,8 @@ struct server_context {
             }
 
             // process in chunks of params.n_batch
-            int32_t n_batch = params.n_batch;
+            int32_t n_batch  = llama_n_batch(ctx);
+            int32_t n_ubatch = llama_n_ubatch(ctx);
 
             // next, batch any pending prompts without exceeding n_batch
             if (params.cont_batching || batch.n_tokens == 0) {
@@ -1811,7 +1812,7 @@ struct server_context {
                 if (slot.embedding) {
                     // this prompt is too large to process - discard it
-                    if (slot.n_prompt_tokens > n_batch) {
+                    if (slot.n_prompt_tokens > n_ubatch) {
                         slot.state   = SLOT_STATE_PROCESSING;
                         slot.command = SLOT_COMMAND_NONE;
                         slot.release();
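The server previously validated embedding prompts against the logical batch size `n_batch`, but pooled embeddings are produced per micro-batch, so the real limit is `n_ubatch`: a non-causal prompt that does not fit in a single micro-batch cannot be processed at all. A minimal caller-side sketch of the same check (the helper name is hypothetical):

```cpp
#include <cstdio>

#include "llama.h"

// Sketch only, mirroring the server check above: an embedding prompt that
// does not fit in a single micro-batch cannot be pooled and must be rejected.
static bool prompt_fits_ubatch(llama_context * ctx, int32_t n_prompt_tokens) {
    const int32_t n_ubatch = (int32_t) llama_n_ubatch(ctx);
    if (n_prompt_tokens > n_ubatch) {
        fprintf(stderr, "embedding prompt too long: %d > %d\n", n_prompt_tokens, n_ubatch);
        return false;
    }
    return true;
}
```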
llama.cpp

@@ -8774,6 +8774,8 @@ static int llama_decode_internal(
     GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
+    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
+
     if (lctx.t_compute_start_us == 0) {
         lctx.t_compute_start_us = ggml_time_us();
     }
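This assertion is what makes the server-side check sufficient: with causal attention a batch of up to `n_batch` tokens can be decoded in several micro-batches of `n_ubatch` tokens, but non-causal attention attends over the whole input at once, so the entire batch must fit into a single micro-batch. One way to honor the invariant when configuring a context for embeddings, sketched with the `embeddings` field name assumed for this branch:

```cpp
#include "llama.h"

// Sketch: for an embedding (non-causal) model, size the micro-batch to the
// logical batch so the assertion above can never fire
// (n_ubatch >= any n_tokens <= n_batch).
static llama_context * make_embedding_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_batch    = 2048;
    cparams.n_ubatch   = cparams.n_batch; // non-causal: one micro-batch covers the whole batch
    cparams.embeddings = true;            // field name assumed for this branch
    return llama_new_context_with_model(model, cparams);
}
```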
@@ -9011,9 +9013,6 @@ static int llama_decode_internal(
         case LLAMA_POOLING_TYPE_CLS:
         case LLAMA_POOLING_TYPE_MEAN:
             {
-                // FIXME: this may not work if the sequences are split into different batches
-                GGML_ASSERT(n_tokens_all == n_tokens);
-
                 GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
 
                 // extract sequence embeddings
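The removed FIXME guarded exactly the case the new assertion now rules out: if a sequence were split across micro-batches, CLS/MEAN pooling would only see part of it. Since a non-causal batch is now guaranteed to fit in one micro-batch, pooling over the whole sequence is safe. For illustration, mean pooling over per-token embeddings, as a sketch assuming a row-major [n_tokens][n_embd] layout:

```cpp
// Sketch: MEAN pooling over per-token embeddings laid out row-major as
// [n_tokens][n_embd]. This is only correct if every token of the sequence
// is present, which is what the n_ubatch guarantee above ensures.
static void mean_pool(const float * embd, int n_tokens, int n_embd, float * out) {
    for (int i = 0; i < n_embd; i++) {
        out[i] = 0.0f;
    }
    for (int t = 0; t < n_tokens; t++) {
        for (int i = 0; i < n_embd; i++) {
            out[i] += embd[t * n_embd + i];
        }
    }
    for (int i = 0; i < n_embd; i++) {
        out[i] /= (float) n_tokens;
    }
}
```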
@@ -13076,6 +13075,10 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
     return ctx->cparams.n_batch;
 }
 
+uint32_t llama_n_ubatch(const struct llama_context * ctx) {
+    return ctx->cparams.n_ubatch;
+}
+
 uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }
llama.h

@@ -378,6 +378,7 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
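With the declaration in place, callers can query the effective sizes from a live context instead of re-deriving them from the parameters they passed in. A small usage sketch (the helper name is hypothetical):

```cpp
#include <cstdio>

#include "llama.h"

// Sketch: read back the effective context, batch, and micro-batch sizes.
static void print_ctx_sizes(const llama_context * ctx) {
    printf("n_ctx = %u, n_batch = %u, n_ubatch = %u\n",
           llama_n_ctx(ctx), llama_n_batch(ctx), llama_n_ubatch(ctx));
}
```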