Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
metal : cleanp
ggml-ci
  • Loading branch information
ggerganov committed Sep 17, 2025
commit 831b2042158af6c14aa4f9e08aae9eb7029c7276
35 changes: 10 additions & 25 deletions ggml/src/ggml-metal/ggml-metal-context.m
Original file line number Diff line number Diff line change
Expand Up @@ -234,17 +234,6 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
}
}

// TODO: temporary shim
static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;

struct ggml_metal_buffer_id res = ggml_metal_buffer_get_id(buffer->context, t);

*offs = res.offs;

return res.metal;
}

static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_tensor * t) {
if (!t) {
return (struct ggml_metal_buffer_id) { nil, 0 };
Expand All @@ -262,14 +251,12 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
length:size
options:MTLResourceStorageModeShared];

size_t buf_dst_offset = 0;
id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);

if (buf_dst == nil) {
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
if (bid_dst.metal == nil) {
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
}

buf_dst_offset += offset;
bid_dst.offs += offset;

// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
Expand All @@ -278,8 +265,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,

[encoder copyFromBuffer:buf_src
sourceOffset:0
toBuffer:buf_dst
destinationOffset:buf_dst_offset
toBuffer:bid_dst.metal
destinationOffset:bid_dst.offs
size:size];

[encoder endEncoding];
Expand All @@ -303,22 +290,20 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
options:MTLResourceStorageModeShared
deallocator:nil];

size_t buf_src_offset = 0;
id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);

if (buf_src == nil) {
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
if (bid_src.metal == nil) {
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
}

buf_src_offset += offset;
bid_src.offs += offset;

// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder copyFromBuffer:buf_src
sourceOffset:buf_src_offset
[encoder copyFromBuffer:bid_src.metal
sourceOffset:bid_src.offs
toBuffer:buf_dst
destinationOffset:0
size:size];
Expand Down
26 changes: 14 additions & 12 deletions ggml/src/ggml-metal/ggml-metal-device.m
Original file line number Diff line number Diff line change
Expand Up @@ -1119,17 +1119,17 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor

@autoreleasepool {
// dst
struct ggml_metal_buffer_id buf_dst = ggml_metal_buffer_get_id(buf, tensor);
buf_dst.offs += offset;
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
bid_dst.offs += offset;

id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];

{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder fillBuffer:buf_dst.metal
range:NSMakeRange(buf_dst.offs, buf_dst.offs + size)
[encoder fillBuffer:bid_dst.metal
range:NSMakeRange(bid_dst.offs, bid_dst.offs + size)
value:value];

[encoder endEncoding];
Expand All @@ -1155,8 +1155,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
deallocator:nil];

// dst
struct ggml_metal_buffer_id buf_dst = ggml_metal_buffer_get_id(buf, tensor);
buf_dst.offs += offset;
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
bid_dst.offs += offset;

// note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
// this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
Expand All @@ -1170,8 +1170,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *

[encoder copyFromBuffer:buf_src
sourceOffset:0
toBuffer:buf_dst.metal
destinationOffset:buf_dst.offs
toBuffer:bid_dst.metal
destinationOffset:bid_dst.offs
size:size];

[encoder endEncoding];
Expand All @@ -1187,6 +1187,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
[cmd_buf commit];

dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
dispatch_release(completion_semaphore);

//[cmd_buf waitUntilCompleted];
}
}
Expand All @@ -1199,8 +1201,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten

@autoreleasepool {
// src
struct ggml_metal_buffer_id buf_src = ggml_metal_buffer_get_id(buf, tensor);
buf_src.offs += offset;
struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf, tensor);
bid_src.offs += offset;

// dst
id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
Expand All @@ -1214,8 +1216,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

[encoder copyFromBuffer:buf_src.metal
sourceOffset:buf_src.offs
[encoder copyFromBuffer:bid_src.metal
sourceOffset:bid_src.offs
toBuffer:buf_dst
destinationOffset:0
size:size];
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-metal/ggml-metal.metal
Original file line number Diff line number Diff line change
Expand Up @@ -1406,7 +1406,10 @@ kernel void kernel_elu_f32_4(
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
const float4 x = src0[tpig];
dst[tpig] = float4(x > 0.0f)*x + float4(x <= 0.0f)*(exp(x) - 1.0f);
dst[tpig][0] = (x[0] > 0.0f) ? x[0] : (exp(x[0]) - 1.0f);
dst[tpig][1] = (x[1] > 0.0f) ? x[1] : (exp(x[1]) - 1.0f);
dst[tpig][2] = (x[2] > 0.0f) ? x[2] : (exp(x[2]) - 1.0f);
dst[tpig][3] = (x[3] > 0.0f) ? x[3] : (exp(x[3]) - 1.0f);
}

kernel void kernel_sqr_f32(
Expand Down
Loading