Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
server : do not make checkpoints if --swa-full
ggml-ci
  • Loading branch information
ggerganov committed Aug 14, 2025
commit e7d2ecdf2ae86c660aca445a1b69aa9dc21fa833
15 changes: 8 additions & 7 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3330,7 +3330,7 @@ struct server_context {
if (!do_reset) {
// restore the checkpoint
const size_t swa_size = it->data.size();
const size_t n = llama_state_seq_set_data(ctx, it->data.data(), swa_size, slot.id);
const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);

if (n != swa_size) {
SLT_ERR(slot, "failed to restore SWA checkpoint, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
Expand Down Expand Up @@ -3361,7 +3361,7 @@ struct server_context {
if (cur.pos_min > pos_min_thold) {
slot.swa_checkpoints.erase(slot.swa_checkpoints.begin() + i);

SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
}
}
}
Expand Down Expand Up @@ -3578,28 +3578,29 @@ struct server_context {
slot.state = SLOT_STATE_GENERATING;

// make a checkpoint with the SWA memory
if (llama_model_n_swa(model) > 0) {
// checkpoints are needed only if we are not using "--swa-full"
if (llama_model_n_swa(model) > 0 && !params_base.swa_full) {
if (slot.swa_checkpoints.size() >= SERVER_MAX_SWA_CHECKPOINTS_PER_SLOT) {
{
const auto & cur = slot.swa_checkpoints.back();

SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
}

slot.swa_checkpoints.erase(slot.swa_checkpoints.begin());
}

const size_t swa_size = llama_state_seq_get_size(ctx, slot.id);
const size_t swa_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);

auto & cur = slot.swa_checkpoints.emplace_back(swa_checkpoint{
/*.pos_min = */ llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id),
/*.pos_max = */ llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id),
/*.data = */ std::vector<uint8_t>(swa_size),
});

llama_state_seq_get_data(ctx, cur.data.data(), swa_size, slot.id);
llama_state_seq_get_data_ext(ctx, cur.data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);

SLT_WRN(slot, "SWA checkpoint create, pos_min = %d, pos_max = %d, size = %f MiB\n", cur.pos_min, cur.pos_max, (float) swa_size / 1024 / 1024);
SLT_WRN(slot, "SWA checkpoint create, pos_min = %d, pos_max = %d, size = %.3f MiB\n", cur.pos_min, cur.pos_max, (float) swa_size / 1024 / 1024);
}
} else if (slot.state != SLOT_STATE_GENERATING) {
continue; // continue loop of slots
Expand Down
Loading