@@ -1358,6 +1358,14 @@ struct server_slot {
         return server_task_type_need_logits(task_type);
     }
 
+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const {
+        return
+            !need_embd() ||
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+    }
+
     bool can_batch_with(server_slot & other_slot) const {
         return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
     }
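
A minimal standalone sketch of the splitting rule introduced above, using hypothetical names (the real predicate calls llama_get_memory() and llama_pooling_type() on the slot's llama_context): a prompt may span multiple ubatches only if the slot needs no embeddings at all, or if past tokens are retained in a memory module and pooling reads only the last token.

// sketch.cpp -- illustrative only, not part of the patch
#include <cstdio>

enum pooling { POOLING_NONE, POOLING_MEAN, POOLING_LAST };

// same boolean structure as server_slot::can_split() above
static bool can_split(bool need_embd, bool has_memory, pooling p) {
    return !need_embd || (has_memory && p == POOLING_LAST);
}

int main() {
    // embedding task, no memory module: everything must fit in one ubatch
    std::printf("%d\n", can_split(true,  false, POOLING_MEAN)); // 0
    // embedding task, memory module + LAST pooling: splitting is safe
    std::printf("%d\n", can_split(true,  true,  POOLING_LAST)); // 1
    // completion task: always splittable
    std::printf("%d\n", can_split(false, false, POOLING_NONE)); // 1
}
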
@@ -1929,14 +1937,6 @@ struct server_context {
         llama_batch_free(batch);
     }
 
-    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
-    // also we cannot split if the pooling would require any past tokens
-    bool can_split() const {
-        return
-            !llama_get_embeddings(ctx) ||
-            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
-    }
-
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
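
Note the behavioral change hidden in the move: the deleted server_context version gated on llama_get_embeddings(ctx), a context-wide flag, whereas the server_slot version uses need_embd(), derived from the slot's own task type, so mixed completion/embedding workloads get a per-slot answer. A hedged sketch of that per-slot derivation (the enum values are illustrative, not the real server_task_type list):

// illustrative only -- mirrors the shape of the per-slot check
enum server_task_type_sketch { TASK_COMPLETION, TASK_EMBEDDING, TASK_RERANK };

static bool need_embd_sketch(server_task_type_sketch t) {
    // embedding-producing tasks need embeddings; completions do not
    return t == TASK_EMBEDDING || t == TASK_RERANK;
}
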
@@ -3130,7 +3130,7 @@ struct server_context {
                     continue;
                 }
 
-                if (!can_split()) {
+                if (!slot.can_split()) {
                     if (slot.n_prompt_tokens > n_ubatch) {
                         slot.release();
                         send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
@@ -3273,7 +3273,7 @@ struct server_context {
                         slot.n_prompt_tokens_processed = 0;
                     }
 
-                    if (!can_split()) {
+                    if (!slot.can_split()) {
                         // cannot fit the prompt in the current batch - will try next iter
                         if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                             continue;
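
The two call sites above are the caller-side consequences of a non-splittable slot. A simplified sketch of the scheduling decision (hypothetical helper, not the real update_slots() code; in llama.cpp, n_ubatch is the physical batch size and n_batch the logical one):

// illustrative only -- condenses the two checks from the hunks above
static const char * schedule(bool can_split, int n_prompt,
                             int n_batch_used, int n_batch, int n_ubatch) {
    if (!can_split) {
        // hunk at 3130: a prompt larger than one physical batch can never run
        if (n_prompt > n_ubatch) {
            return "error: increase the physical batch size";
        }
        // hunk at 3273: does not fit in the current batch, try next iteration
        if (n_batch_used + n_prompt > n_batch) {
            return "defer";
        }
    }
    return "process";
}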