
llama : Metal inference #1642


Merged · 49 commits · Jun 4, 2023
Changes from 1 commit
Commits (49, all authored by ggerganov)
f85020b  mtl : export the LLaMA computation graph (May 29, 2023)
98c267f  ci : disable temporary (May 29, 2023)
b23fe8c  mtl : adapt the MNIST example as starter (May 29, 2023)
a792cbd  mtl : no need for mtl-export tool, add cli arg for main instead (May 29, 2023)
897d6d8  mtl : export just a small part of the graph for now to make it easier (May 29, 2023)
248a8c3  mtl : move MSL code into separate file for easy editing (May 29, 2023)
a8fd9dc  mtl : initial get_rows_q4_0 kernel (May 29, 2023)
794704e  mtl : confirmed get_rows_q4_0 is working correctly (May 30, 2023)
72256eb  mtl : add rms_norm kernel + confirm working (May 30, 2023)
64afc0b  mtl : add mul kernel + confirm working (May 30, 2023)
2a24994  mtl : initial mul_mat Q4 kernel (wrong results) (May 30, 2023)
96d0052  mtl : mul_mat fixes (still wrong) (May 30, 2023)
29bec00  mtl : another mul_mat Q4 (still does not work) (May 30, 2023)
b2fd06c  mtl : working mul_mat q4 (May 30, 2023)
6af6a05  ggml : fix handling of "view" ops in ggml_graph_import() (May 31, 2023)
1213af7  mtl : add rope kernel (May 31, 2023)
7ca81e9  mtl : add reshape and transpose handling (May 31, 2023)
94ea9e7  ggml : store offset as opt arg for ggml_view_xd() operators (Jun 1, 2023)
948fcfd  mtl : add cpy kernel + handle view ops (Jun 1, 2023)
51efb59  mtl : confirm f16 x f32 attention mul mat (Jun 1, 2023)
0f1c580  mtl : add scale kernel (Jun 1, 2023)
17a7036  mtl : add diag_mask_inf kernel (Jun 1, 2023)
17930fb  mtl : fix soft_max kernel (Jun 1, 2023)
f67c2d8  ggml : update ggml_nbytes() to handle non-contiguous tensors (Jun 1, 2023)
a266c26  mtl : verify V tensor contents (Jun 1, 2023)
a0cc3de  mtl : add f32 -> f32 cpy kernel (Jun 1, 2023)
42dca40  mtl : add silu kernel (Jun 1, 2023)
fbd3f62  mtl : add non-broadcast mul kernel (Jun 1, 2023)
9665429  mtl : full GPU inference of the computation graph (Jun 1, 2023)
f0196a7  mtl : optimize rms_norm and soft_max kernels (Jun 1, 2023)
e55f7b0  mtl : add f16 mat x f32 vec multiplication kernel (Jun 1, 2023)
3367146  mtl : fix bug in f16 x f32 mul mat + speed-up computation (Jun 2, 2023)
847bbfe  mtl : faster mul_mat_q4_0_f32 kernel (Jun 2, 2023)
70c3387  mtl : fix kernel signature + roll inner loop (Jun 2, 2023)
b088e14  mtl : more threads for rms_norm + better timing (Jun 2, 2023)
6276057  mtl : remove printfs from inner loop (Jun 2, 2023)
03c2d72  mtl : simplify implementation (Jun 2, 2023)
640a889  mtl : add save/load vocab to ggml file (Jun 2, 2023)
2f4e9d1  mtl : plug Metal inference into llama.cpp (very quick-n-dirty) (Jun 2, 2023)
4df2ef3  mtl : make it work with main example (Jun 3, 2023)
18e482a  mtl : preparing for merge (Jun 4, 2023)
e4b5222  mtl : clean-up ggml mtl interface + suport scratch / inplace (Jun 4, 2023)
e26cd6b  mtl : remove temp / debug code (Jun 4, 2023)
a7fb899  metal : final refactoring and simplification (Jun 4, 2023)
d8a7486  Revert "ci : disable temporary" (Jun 4, 2023)
b252acb  metal : add comments (Jun 4, 2023)
db3db9e  metal : clean-up stuff, fix typos (Jun 4, 2023)
e33002d  readme : add Metal instructions (Jun 4, 2023)
324e823  readme : add example for main (Jun 4, 2023)
metal : final refactoring and simplification
ggerganov committed Jun 4, 2023
commit a7fb899c53013f6a9b776a073f93272f6954805b
.gitignore: 2 changes (1 addition, 1 deletion)
@@ -17,7 +17,7 @@ build-release/
build-static/
build-cublas/
build-opencl/
build-mtl/
build-metal/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
CMakeLists.txt: 6 changes (3 additions, 3 deletions)
@@ -207,16 +207,16 @@ if (LLAMA_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-mtl.metal to bin directory
configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
Makefile: 33 changes (27 additions, 6 deletions)
@@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -116,26 +117,30 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
LDFLAGS += -lopenblas -lcblas
else
LDFLAGS += -lopenblas
endif
endif
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -156,33 +161,49 @@ endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
else
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o

ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
endif # LLAMA_CLBLAST

ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
OBJS += ggml-metal.o

ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL

ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
examples/CMakeLists.txt: 2 changes (1 addition, 1 deletion)
@@ -38,7 +38,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
if (LLAMA_METAL)
add_subdirectory(mtl)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
examples/metal/CMakeLists.txt: 3 changes (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
examples/mtl/mtl.cpp → examples/metal/metal.cpp: 18 changes (9 additions, 9 deletions)
@@ -1,5 +1,5 @@
#include "ggml.h"
#include "ggml-mtl.h"
#include "ggml-metal.h"

#include <cstdio>
#include <cstring>
@@ -23,28 +23,28 @@ int main(int argc, char ** argv) {
gf.n_threads = 1;

// this allocates all Metal resources and memory buffers
auto * ctx_mtl = ggml_mtl_init();
auto * ctx_metal = ggml_metal_init();

ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS

ggml_mtl_set_tensor(ctx_mtl, input);
ggml_metal_set_tensor(ctx_metal, input);

// warmup
ggml_mtl_graph_compute(ctx_mtl, &gf);
ggml_metal_graph_compute(ctx_metal, &gf);

const int n_iter = 16;

const int64_t t0 = ggml_time_us();

// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_mtl_graph_compute(ctx_mtl, &gf);
ggml_metal_graph_compute(ctx_metal, &gf);
}

const int64_t t1 = ggml_time_us();
@@ -55,7 +55,7 @@ int main(int argc, char ** argv) {
// debug output
{
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_mtl_get_tensor(ctx_mtl, logits);
ggml_metal_get_tensor(ctx_metal, logits);

float * ptr = (float *) ggml_get_data(logits);

@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
}

ggml_mtl_free(ctx_mtl);
ggml_metal_free(ctx_metal);

ggml_free(ctx_data);
ggml_free(ctx_eval);
examples/mtl/CMakeLists.txt: 6 changes (0 additions, 6 deletions)

This file was deleted.

ggml-metal.h: 63 changes (63 additions, 0 deletions)
@@ -0,0 +1,63 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;

struct ggml_metal_context * ggml_metal_init(void);
void ggml_metal_free(struct ggml_metal_context * ctx);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
#endif
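
For reference, the call sequence described in the header comments above looks like this in condensed form. This is a minimal sketch based only on the functions declared in ggml-metal.h and the flow of examples/metal/metal.cpp in this commit; the exported-graph argument and printed output are illustrative, not part of the diff.

// minimal sketch: evaluate an exported ggml graph on the GPU via ggml-metal.h
// (assumes argv[1] is a graph exported beforehand, as in examples/metal/metal.cpp)
#include "ggml.h"
#include "ggml-metal.h"

#include <cstdio>

int main(int argc, char ** argv) {
    ggml_time_init();

    // load the exported compute graph and its data/eval contexts (ggml.h helper)
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf = ggml_graph_import(argv[1], &ctx_data, &ctx_eval);

    // allocate Metal resources and map every host buffer used by the graph
    struct ggml_metal_context * ctx_metal = ggml_metal_init();

    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

    // upload the input token, run the graph on the GPU, read back the logits
    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
    *(int32_t *) input->data = 1; // BOS

    ggml_metal_set_tensor(ctx_metal, input);
    ggml_metal_graph_compute(ctx_metal, &gf);

    struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
    ggml_metal_get_tensor(ctx_metal, logits);

    printf("first logit: %f\n", ((float *) ggml_get_data(logits))[0]);

    // release device and host resources
    ggml_metal_free(ctx_metal);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return 0;
}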
