
llama : Metal inference #1642


Merged · 49 commits · Jun 4, 2023
Changes from 1 commit
Commits (49, all authored by ggerganov)
f85020b  mtl : export the LLaMA computation graph (May 29, 2023)
98c267f  ci : disable temporary (May 29, 2023)
b23fe8c  mtl : adapt the MNIST example as starter (May 29, 2023)
a792cbd  mtl : no need for mtl-export tool, add cli arg for main instead (May 29, 2023)
897d6d8  mtl : export just a small part of the graph for now to make it easier (May 29, 2023)
248a8c3  mtl : move MSL code into separate file for easy editing (May 29, 2023)
a8fd9dc  mtl : initial get_rows_q4_0 kernel (May 29, 2023)
794704e  mtl : confirmed get_rows_q4_0 is working correctly (May 30, 2023)
72256eb  mtl : add rms_norm kernel + confirm working (May 30, 2023)
64afc0b  mtl : add mul kernel + confirm working (May 30, 2023)
2a24994  mtl : initial mul_mat Q4 kernel (wrong results) (May 30, 2023)
96d0052  mtl : mul_mat fixes (still wrong) (May 30, 2023)
29bec00  mtl : another mul_mat Q4 (still does not work) (May 30, 2023)
b2fd06c  mtl : working mul_mat q4 (May 30, 2023)
6af6a05  ggml : fix handling of "view" ops in ggml_graph_import() (May 31, 2023)
1213af7  mtl : add rope kernel (May 31, 2023)
7ca81e9  mtl : add reshape and transpose handling (May 31, 2023)
94ea9e7  ggml : store offset as opt arg for ggml_view_xd() operators (Jun 1, 2023)
948fcfd  mtl : add cpy kernel + handle view ops (Jun 1, 2023)
51efb59  mtl : confirm f16 x f32 attention mul mat (Jun 1, 2023)
0f1c580  mtl : add scale kernel (Jun 1, 2023)
17a7036  mtl : add diag_mask_inf kernel (Jun 1, 2023)
17930fb  mtl : fix soft_max kernel (Jun 1, 2023)
f67c2d8  ggml : update ggml_nbytes() to handle non-contiguous tensors (Jun 1, 2023)
a266c26  mtl : verify V tensor contents (Jun 1, 2023)
a0cc3de  mtl : add f32 -> f32 cpy kernel (Jun 1, 2023)
42dca40  mtl : add silu kernel (Jun 1, 2023)
fbd3f62  mtl : add non-broadcast mul kernel (Jun 1, 2023)
9665429  mtl : full GPU inference of the computation graph (Jun 1, 2023)
f0196a7  mtl : optimize rms_norm and soft_max kernels (Jun 1, 2023)
e55f7b0  mtl : add f16 mat x f32 vec multiplication kernel (Jun 1, 2023)
3367146  mtl : fix bug in f16 x f32 mul mat + speed-up computation (Jun 2, 2023)
847bbfe  mtl : faster mul_mat_q4_0_f32 kernel (Jun 2, 2023)
70c3387  mtl : fix kernel signature + roll inner loop (Jun 2, 2023)
b088e14  mtl : more threads for rms_norm + better timing (Jun 2, 2023)
6276057  mtl : remove printfs from inner loop (Jun 2, 2023)
03c2d72  mtl : simplify implementation (Jun 2, 2023)
640a889  mtl : add save/load vocab to ggml file (Jun 2, 2023)
2f4e9d1  mtl : plug Metal inference into llama.cpp (very quick-n-dirty) (Jun 2, 2023)
4df2ef3  mtl : make it work with main example (Jun 3, 2023)
18e482a  mtl : preparing for merge (Jun 4, 2023)
e4b5222  mtl : clean-up ggml mtl interface + suport scratch / inplace (Jun 4, 2023)
e26cd6b  mtl : remove temp / debug code (Jun 4, 2023)
a7fb899  metal : final refactoring and simplification (Jun 4, 2023)
d8a7486  Revert "ci : disable temporary" (Jun 4, 2023)
b252acb  metal : add comments (Jun 4, 2023)
db3db9e  metal : clean-up stuff, fix typos (Jun 4, 2023)
e33002d  readme : add Metal instructions (Jun 4, 2023)
324e823  readme : add example for main (Jun 4, 2023)
metal : final refactoring and simplification
ggerganov committed Jun 4, 2023
commit a7fb899c53013f6a9b776a073f93272f6954805b
.gitignore: 2 changes (1 addition, 1 deletion)
@@ -17,7 +17,7 @@ build-release/
build-static/
build-cublas/
build-opencl/
build-mtl/
build-metal/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
CMakeLists.txt: 6 changes (3 additions, 3 deletions)
@@ -207,16 +207,16 @@ if (LLAMA_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-mtl.metal to bin directory
configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
Makefile: 33 changes (27 additions, 6 deletions)
@@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -116,26 +117,30 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
LDFLAGS += -lopenblas -lcblas
else
LDFLAGS += -lopenblas
endif
endif
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -156,33 +161,49 @@ endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
else
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o

ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
endif # LLAMA_CLBLAST

ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
OBJS += ggml-metal.o

ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL

ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
examples/CMakeLists.txt: 2 changes (1 addition, 1 deletion)
@@ -38,7 +38,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
if (LLAMA_METAL)
add_subdirectory(mtl)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
examples/metal/CMakeLists.txt: 3 changes (3 additions, 0 deletions)
@@ -0,0 +1,3 @@
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
examples/mtl/mtl.cpp → examples/metal/metal.cpp: 18 changes (9 additions, 9 deletions)
@@ -1,5 +1,5 @@
#include "ggml.h"
#include "ggml-mtl.h"
#include "ggml-metal.h"

#include <cstdio>
#include <cstring>
@@ -23,28 +23,28 @@ int main(int argc, char ** argv) {
gf.n_threads = 1;

// this allocates all Metal resources and memory buffers
auto * ctx_mtl = ggml_mtl_init();
auto * ctx_metal = ggml_metal_init();

ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS

ggml_mtl_set_tensor(ctx_mtl, input);
ggml_metal_set_tensor(ctx_metal, input);

// warmup
ggml_mtl_graph_compute(ctx_mtl, &gf);
ggml_metal_graph_compute(ctx_metal, &gf);

const int n_iter = 16;

const int64_t t0 = ggml_time_us();

// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_mtl_graph_compute(ctx_mtl, &gf);
ggml_metal_graph_compute(ctx_metal, &gf);
}

const int64_t t1 = ggml_time_us();
@@ -55,7 +55,7 @@ int main(int argc, char ** argv) {
// debug output
{
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_mtl_get_tensor(ctx_mtl, logits);
ggml_metal_get_tensor(ctx_metal, logits);

float * ptr = (float *) ggml_get_data(logits);

@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
}

ggml_mtl_free(ctx_mtl);
ggml_metal_free(ctx_metal);

ggml_free(ctx_data);
ggml_free(ctx_eval);
examples/mtl/CMakeLists.txt: 6 changes (0 additions, 6 deletions)

This file was deleted.

ggml-metal.h: 63 changes (63 additions, 0 deletions)
@@ -0,0 +1,63 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;

struct ggml_metal_context * ggml_metal_init(void);
void ggml_metal_free(struct ggml_metal_context * ctx);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
#endif
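
For reference, the call sequence described in the header comments above looks like this in condensed form. This is a minimal sketch based only on the functions declared in ggml-metal.h and the flow of examples/metal/metal.cpp in this commit; the exported-graph argument and printed output are illustrative, not part of the diff.

// minimal sketch: evaluate an exported ggml graph on the GPU via ggml-metal.h
// (assumes argv[1] is a graph exported beforehand, as in examples/metal/metal.cpp)
#include "ggml.h"
#include "ggml-metal.h"

#include <cstdio>

int main(int argc, char ** argv) {
    ggml_time_init();

    // load the exported compute graph and its data/eval contexts (ggml.h helper)
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf = ggml_graph_import(argv[1], &ctx_data, &ctx_eval);

    // allocate Metal resources and map every host buffer used by the graph
    struct ggml_metal_context * ctx_metal = ggml_metal_init();

    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

    // upload the input token, run the graph on the GPU, read back the logits
    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
    *(int32_t *) input->data = 1; // BOS

    ggml_metal_set_tensor(ctx_metal, input);
    ggml_metal_graph_compute(ctx_metal, &gf);

    struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
    ggml_metal_get_tensor(ctx_metal, logits);

    printf("first logit: %f\n", ((float *) ggml_get_data(logits))[0]);

    // release device and host resources
    ggml_metal_free(ctx_metal);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return 0;
}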
