
k-quants : allow to optionally disable at compile time #1734

Merged on Jun 7, 2023 (2 commits)
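
This PR makes the k-quants code optional at build time: CMake gains an LLAMA_K_QUANTS option (ON by default), and the Makefile build honors a LLAMA_NO_K_QUANTS switch. As a usage sketch, assuming a typical out-of-tree CMake build directory:

    # CMake: k-quants are on by default; turn them off explicitly
    cmake .. -DLLAMA_K_QUANTS=OFF
    cmake --build .

    # Makefile: k-quants are built unless LLAMA_NO_K_QUANTS is set
    make LLAMA_NO_K_QUANTS=1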
CMakeLists.txt (8 changes: 6 additions & 2 deletions)
@@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
+option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -226,6 +227,10 @@ if (LLAMA_METAL
         )
 endif()
 
+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -396,11 +401,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ggml-quants-k.h
-            ggml-quants-k.c
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
             ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_EXTRA}
             )
 
 target_include_directories(ggml PUBLIC .)
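
With LLAMA_K_QUANTS left ON (the default), GGML_SOURCES_EXTRA carries the k-quants files into the add_library call above. As a minimal sketch of the effective result, assuming none of the CUDA/OpenCL/Metal options are enabled (their source variables then stay empty):

    # Effective source list with -DLLAMA_K_QUANTS=ON (sketch only)
    add_library(ggml OBJECT
                ggml.c
                ggml.h
                k_quants.c
                k_quants.h
                )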
Makefile (37 changes: 22 additions & 15 deletions)
@@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 	endif
 endif
 
+ifndef LLAMA_NO_K_QUANTS
+	CFLAGS += -DGGML_USE_K_QUANTS
+	OBJS   += k_quants.o
+endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
@@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS
 endif # LLAMA_OPENBLAS
 
 ifdef LLAMA_BLIS
-	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

@@ -212,6 +217,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
+ifndef LLAMA_NO_K_QUANTS
+k_quants.o: k_quants.c k_quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS
+
 #
 # Print build information
 #
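
Read together with the CFLAGS/OBJS hunk further up, this is the whole Makefile story. A sketch of the effective fragments in a default build (i.e. LLAMA_NO_K_QUANTS unset):

    # k-quant code paths compiled in, object linked into every target via $(OBJS)
    CFLAGS += -DGGML_USE_K_QUANTS
    OBJS   += k_quants.o

    k_quants.o: k_quants.c k_quants.h
    	$(CC) $(CFLAGS) -c $< -o $@

Setting LLAMA_NO_K_QUANTS=1 removes all three pieces at once, so nothing references k_quants.c.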
@@ -231,10 +241,7 @@ $(info )
 # Build library
 #
 
-ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h
-	$(CC) $(CFLAGS) -c $< -o $@
-
-ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
@@ -243,7 +250,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
+libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
@@ -253,28 +260,28 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -289,11 +296,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
 
-vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 .PHONY: tests clean
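
The pattern is identical across every target above: the explicit ggml-quants-k.o prerequisite disappears, and the renamed k_quants.o now reaches each link line through $(OBJS) instead. As a hypothetical expansion for the default build (assuming no other optional backends add objects), make effectively sees:

    main: examples/main/main.cpp build-info.h ggml.o llama.o common.o k_quants.o
    	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

so a build with LLAMA_NO_K_QUANTS=1 simply drops k_quants.o from $^ without touching any target definition.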