metal : cleanp

ggml-ci
ggml-org · ggerganov · Sep 17, 2025 · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025
commit 831b2042158af6c14aa4f9e08aae9eb7029c7276
@@ -234,17 +234,6 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
     }
 }
 
-// TODO: temporary shim
-static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
-    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
-
-    struct ggml_metal_buffer_id res = ggml_metal_buffer_get_id(buffer->context, t);
-
-    *offs = res.offs;
-
-    return res.metal;
-}
-
 static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_tensor * t) {
     if (!t) {
         return (struct ggml_metal_buffer_id) { nil, 0 };
@@ -262,14 +251,12 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
                                                          length:size
                                                         options:MTLResourceStorageModeShared];
 
-        size_t buf_dst_offset = 0;
-        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
-
-        if (buf_dst == nil) {
+        struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
+        if (bid_dst.metal == nil) {
             GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        buf_dst_offset += offset;
+        bid_dst.offs += offset;
 
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
@@ -278,8 +265,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 
         [encoder copyFromBuffer:buf_src
                    sourceOffset:0
-                       toBuffer:buf_dst
-              destinationOffset:buf_dst_offset
+                       toBuffer:bid_dst.metal
+              destinationOffset:bid_dst.offs
                            size:size];
 
         [encoder endEncoding];
@@ -303,22 +290,20 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
 
-        size_t buf_src_offset = 0;
-        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
-
-        if (buf_src == nil) {
+        struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
+        if (bid_src.metal == nil) {
             GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        buf_src_offset += offset;
+        bid_src.offs += offset;
 
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
         id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
-        [encoder copyFromBuffer:buf_src
-                   sourceOffset:buf_src_offset
+        [encoder copyFromBuffer:bid_src.metal
+                   sourceOffset:bid_src.offs
                        toBuffer:buf_dst
               destinationOffset:0
                            size:size];

@@ -1119,17 +1119,17 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
 
     @autoreleasepool {
         // dst
-        struct ggml_metal_buffer_id buf_dst = ggml_metal_buffer_get_id(buf, tensor);
-        buf_dst.offs += offset;
+        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
+        bid_dst.offs += offset;
 
         id<MTLCommandQueue>  queue   = buf->queue;
         id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
-            [encoder fillBuffer:buf_dst.metal
-                          range:NSMakeRange(buf_dst.offs, buf_dst.offs + size)
+            [encoder fillBuffer:bid_dst.metal
+                          range:NSMakeRange(bid_dst.offs, bid_dst.offs + size)
                           value:value];
 
             [encoder endEncoding];
@@ -1155,8 +1155,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
                                                           deallocator:nil];
 
         // dst
-        struct ggml_metal_buffer_id buf_dst = ggml_metal_buffer_get_id(buf, tensor);
-        buf_dst.offs += offset;
+        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
+        bid_dst.offs += offset;
 
         // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
         //       this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
@@ -1170,8 +1170,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
 
             [encoder copyFromBuffer:buf_src
                        sourceOffset:0
-                           toBuffer:buf_dst.metal
-                  destinationOffset:buf_dst.offs
+                           toBuffer:bid_dst.metal
+                  destinationOffset:bid_dst.offs
                                size:size];
 
             [encoder endEncoding];
@@ -1187,6 +1187,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
         [cmd_buf commit];
 
         dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
+        dispatch_release(completion_semaphore);
+
         //[cmd_buf waitUntilCompleted];
     }
 }
@@ -1199,8 +1201,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
 
     @autoreleasepool {
         // src
-        struct ggml_metal_buffer_id buf_src = ggml_metal_buffer_get_id(buf, tensor);
-        buf_src.offs += offset;
+        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf, tensor);
+        bid_src.offs += offset;
 
         // dst
         id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
@@ -1214,8 +1216,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
-            [encoder copyFromBuffer:buf_src.metal
-                       sourceOffset:buf_src.offs
+            [encoder copyFromBuffer:bid_src.metal
+                       sourceOffset:bid_src.offs
                            toBuffer:buf_dst
                   destinationOffset:0
                                size:size];

@@ -1406,7 +1406,10 @@ kernel void kernel_elu_f32_4(
         device       float4 * dst,
         uint tpig[[thread_position_in_grid]]) {
     const float4 x = src0[tpig];
-    dst[tpig] = float4(x > 0.0f)*x + float4(x <= 0.0f)*(exp(x) - 1.0f);
+    dst[tpig][0] = (x[0] > 0.0f) ? x[0] : (exp(x[0]) - 1.0f);
+    dst[tpig][1] = (x[1] > 0.0f) ? x[1] : (exp(x[1]) - 1.0f);
+    dst[tpig][2] = (x[2] > 0.0f) ? x[2] : (exp(x[2]) - 1.0f);
+    dst[tpig][3] = (x[3] > 0.0f) ? x[3] : (exp(x[3]) - 1.0f);
 }
 
 kernel void kernel_sqr_f32(