llama : Metal inference #1642


Merged · 49 commits · Jun 4, 2023
Commits
f85020b
mtl : export the LLaMA computation graph
ggerganov May 29, 2023
98c267f
ci : disable temporary
ggerganov May 29, 2023
b23fe8c
mtl : adapt the MNIST example as starter
ggerganov May 29, 2023
a792cbd
mtl : no need for mtl-export tool, add cli arg for main instead
ggerganov May 29, 2023
897d6d8
mtl : export just a small part of the graph for now to make it easier
ggerganov May 29, 2023
248a8c3
mtl : move MSL code into separate file for easy editing
ggerganov May 29, 2023
a8fd9dc
mtl : initial get_rows_q4_0 kernel
ggerganov May 29, 2023
794704e
mtl : confirmed get_rows_q4_0 is working correctly
ggerganov May 30, 2023
72256eb
mtl : add rms_norm kernel + confirm working
ggerganov May 30, 2023
64afc0b
mtl : add mul kernel + confirm working
ggerganov May 30, 2023
2a24994
mtl : initial mul_mat Q4 kernel (wrong results)
ggerganov May 30, 2023
96d0052
mtl : mul_mat fixes (still wrong)
ggerganov May 30, 2023
29bec00
mtl : another mul_mat Q4 (still does not work)
ggerganov May 30, 2023
b2fd06c
mtl : working mul_mat q4
ggerganov May 30, 2023
6af6a05
ggml : fix handling of "view" ops in ggml_graph_import()
ggerganov May 31, 2023
1213af7
mtl : add rope kernel
ggerganov May 31, 2023
7ca81e9
mtl : add reshape and transpose handling
ggerganov May 31, 2023
94ea9e7
ggml : store offset as opt arg for ggml_view_xd() operators
ggerganov Jun 1, 2023
948fcfd
mtl : add cpy kernel + handle view ops
ggerganov Jun 1, 2023
51efb59
mtl : confirm f16 x f32 attention mul mat
ggerganov Jun 1, 2023
0f1c580
mtl : add scale kernel
ggerganov Jun 1, 2023
17a7036
mtl : add diag_mask_inf kernel
ggerganov Jun 1, 2023
17930fb
mtl : fix soft_max kernel
ggerganov Jun 1, 2023
f67c2d8
ggml : update ggml_nbytes() to handle non-contiguous tensors
ggerganov Jun 1, 2023
a266c26
mtl : verify V tensor contents
ggerganov Jun 1, 2023
a0cc3de
mtl : add f32 -> f32 cpy kernel
ggerganov Jun 1, 2023
42dca40
mtl : add silu kernel
ggerganov Jun 1, 2023
fbd3f62
mtl : add non-broadcast mul kernel
ggerganov Jun 1, 2023
9665429
mtl : full GPU inference of the computation graph
ggerganov Jun 1, 2023
f0196a7
mtl : optimize rms_norm and soft_max kernels
ggerganov Jun 1, 2023
e55f7b0
mtl : add f16 mat x f32 vec multiplication kernel
ggerganov Jun 1, 2023
3367146
mtl : fix bug in f16 x f32 mul mat + speed-up computation
ggerganov Jun 2, 2023
847bbfe
mtl : faster mul_mat_q4_0_f32 kernel
ggerganov Jun 2, 2023
70c3387
mtl : fix kernel signature + roll inner loop
ggerganov Jun 2, 2023
b088e14
mtl : more threads for rms_norm + better timing
ggerganov Jun 2, 2023
6276057
mtl : remove printfs from inner loop
ggerganov Jun 2, 2023
03c2d72
mtl : simplify implementation
ggerganov Jun 2, 2023
640a889
mtl : add save/load vocab to ggml file
ggerganov Jun 2, 2023
2f4e9d1
mtl : plug Metal inference into llama.cpp (very quick-n-dirty)
ggerganov Jun 2, 2023
4df2ef3
mtl : make it work with main example
ggerganov Jun 3, 2023
18e482a
mtl : preparing for merge
ggerganov Jun 4, 2023
e4b5222
mtl : clean-up ggml mtl interface + suport scratch / inplace
ggerganov Jun 4, 2023
e26cd6b
mtl : remove temp / debug code
ggerganov Jun 4, 2023
a7fb899
metal : final refactoring and simplification
ggerganov Jun 4, 2023
d8a7486
Revert "ci : disable temporary"
ggerganov Jun 4, 2023
b252acb
metal : add comments
ggerganov Jun 4, 2023
db3db9e
metal : clean-up stuff, fix typos
ggerganov Jun 4, 2023
e33002d
readme : add Metal instructions
ggerganov Jun 4, 2023
324e823
readme : add example for main
ggerganov Jun 4, 2023
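
The kernels added in these commits (get_rows_q4_0, mul_mat q4, rms_norm, rope, scale, diag_mask_inf, soft_max, silu, cpy) port ggml's operators to Metal one by one until the whole graph runs on the GPU. For orientation, the sketch below is a simplified CPU-side reference of what the Q4_0 kernels have to reproduce: each Q4_0 block packs 32 weights as 4-bit values plus a single scale, and a weight is recovered as `(q - 8) * d`. This is not code from the PR, and the nibble packing shown is illustrative rather than ggml's exact layout.

```cpp
// Simplified reference for Q4_0 dequantization (illustrative sketch, not from this PR).
// ggml stores 32 weights per block: one scale (f16 in ggml, widened to float here)
// plus 16 bytes holding 32 four-bit quants. A weight is recovered as (q - 8) * d.
#include <cstdint>

constexpr int QK4_0 = 32;

struct block_q4_0_sketch {
    float   d;               // block scale
    uint8_t qs[QK4_0 / 2];   // 32 x 4-bit quantized values, two per byte
};

void dequantize_q4_0_sketch(const block_q4_0_sketch & b, float * out /* [QK4_0] */) {
    for (int j = 0; j < QK4_0 / 2; ++j) {
        const int q_lo = (b.qs[j] & 0x0F) - 8; // low nibble
        const int q_hi = (b.qs[j] >>   4) - 8; // high nibble
        // note: the actual ggml nibble ordering may differ; this is only illustrative
        out[2*j + 0] = q_lo * b.d;
        out[2*j + 1] = q_hi * b.d;
    }
}
```

The `mul_mat_q4_0_f32` kernel fuses this dequantization with the dot product against f32 activations, which is where most of the optimization commits above are spent.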
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@ build-release/
build-static/
build-cublas/
build-opencl/
build-metal/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
62 changes: 47 additions & 15 deletions CMakeLists.txt
@@ -64,13 +64,14 @@ if (NOT MSVC)
endif()

# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)

enable_language(CUDA)

set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
endif()
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG)

# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
${METALPERFORMANCE_FRAMEWORK}
)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")

set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

add_compile_definitions(GGML_USE_CLBLAST)

@@ -370,8 +396,10 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES})
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
)

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -384,21 +412,25 @@ endif()
add_library(llama
llama.cpp
llama.h
llama-util.h)
llama-util.h
)

target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE
ggml
${LLAMA_EXTRA_LIBS}
)

if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()

if (GGML_CUDA_SOURCES)
if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
endif()

33 changes: 27 additions & 6 deletions Makefile
@@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
#CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -116,26 +117,30 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
LDFLAGS += -lopenblas -lcblas
else
LDFLAGS += -lopenblas
endif
endif
endif # LLAMA_OPENBLAS

ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -156,33 +161,49 @@ endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
else
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o

ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
endif # LLAMA_CLBLAST

ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
OBJS += ggml-metal.o

ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL

ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
31 changes: 28 additions & 3 deletions README.md
@@ -51,11 +51,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook

- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 4-bit, 5-bit and 8-bit integer quantization support
- Runs on the CPU
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
- cuBLAS and CLBlast support

@@ -236,6 +235,32 @@ In order to build llama.cpp you have three different options.
zig build -Drelease-fast
```

### Metal Build

Using Metal allows the computation to be executed on the GPU for Apple devices:

- Using `make`:

```bash
LLAMA_METAL=1 make
```

- Using `CMake`:

```bash
mkdir build-metal
cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release
```

When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
Any value larger than 0 will offload the computation to the GPU. For example:

```bash
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
@@ -367,7 +392,7 @@ Building the program with BLAS support may lead to some performance improvements

Running:

The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
5 changes: 4 additions & 1 deletion examples/CMakeLists.txt
@@ -37,7 +37,10 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
if(LLAMA_BUILD_SERVER)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
endif()
3 changes: 3 additions & 0 deletions examples/common.cpp
@@ -299,6 +299,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -438,6 +440,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " number of layers to store in VRAM\n");
#endif
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
1 change: 1 addition & 0 deletions examples/common.h
@@ -71,6 +71,7 @@ struct gpt_params {
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};

7 changes: 7 additions & 0 deletions examples/main/main.cpp
@@ -134,6 +134,13 @@ int main(int argc, char ** argv) {
return 0;
}

// export the cgraph and exit
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);

return 0;
}

std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
3 changes: 3 additions & 0 deletions examples/metal/CMakeLists.txt
@@ -0,0 +1,3 @@
set(TEST_TARGET metal)
add_executable(${TEST_TARGET} metal.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
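
The body of `examples/metal/metal.cpp` is not shown in this view, but the standalone `metal` example built here consumes a computation graph exported with `./main --export` (see the `examples/main/main.cpp` change above) and evaluates it through the ggml Metal backend. The sketch below is only an approximation of that flow, not the actual file; the `ggml_metal_*` names follow the interface this PR adds in `ggml-metal.h`, but the exact signatures and tensor names are assumptions.

```cpp
// Approximate sketch of the standalone Metal example (not the actual examples/metal/metal.cpp).
// Assumes the ggml / ggml-metal interfaces introduced by this PR; signatures may differ.
#include "ggml.h"
#include "ggml-metal.h"

#include <cstdint>
#include <cstdio>

int main(int argc, char ** argv) {
    const char * fname = argc > 1 ? argv[1] : "llama.ggml"; // graph file written by `./main --export`

    // import the exported computation graph together with its data/eval contexts
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph gf = ggml_graph_import(fname, &ctx_data, &ctx_eval);

    // create the Metal context and expose the ggml memory buffers to the GPU
    struct ggml_metal_context * ctx_metal = ggml_metal_init();
    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

    // set the input token ("embd" is the input tensor name used by llama.cpp's graph; an assumption here)
    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
    *(int32_t *) input->data = 1; // BOS
    ggml_metal_set_tensor(ctx_metal, input);

    // run the whole graph on the GPU and read the result of the last node back to the CPU
    ggml_metal_graph_compute(ctx_metal, &gf);

    struct ggml_tensor * result = gf.nodes[gf.n_nodes - 1];
    ggml_metal_get_tensor(ctx_metal, result);
    printf("first output value: %f\n", ((float *) result->data)[0]);

    ggml_metal_free(ctx_metal);
    ggml_free(ctx_eval);
    ggml_free(ctx_data);

    return 0;
}
```

In `llama.cpp` itself the same interface is driven from the regular eval path, with the `-ngl` argument deciding whether the graph is dispatched to Metal or computed on the CPU.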