Spaces (Build error)

Commit 7243d06 • Illumotion committed
1 parent: da7d918

Upload folder using huggingface_hub
Browse files:
- .gitignore +0 -4
- CMakeLists.txt +0 -2
- Makefile +14 -16
- Package.swift +3 -4
- colab.ipynb +1 -1
- common/CMakeLists.txt +0 -2
- common/common.cpp +172 -56
- common/common.h +39 -4
- convert-refact-hf-to-gguf.py +63 -8
- examples/CMakeLists.txt +0 -1
- examples/batched/batched.cpp +1 -1
- examples/embd-input/embd-input-lib.cpp +9 -10
- examples/infill/infill.cpp +12 -43
- examples/main/main.cpp +11 -13
- examples/parallel/parallel.cpp +3 -5
- examples/save-load-state/save-load-state.cpp +2 -3
- examples/server/index.html.hpp +0 -0
- examples/server/public/index.html +58 -133
- examples/server/server.cpp +60 -93
- examples/speculative/speculative.cpp +2 -10
- ggml-alloc.c +107 -62
- ggml-alloc.h +5 -11
- ggml-cuda.cu +78 -500
- ggml-cuda.h +0 -4
- ggml-metal.h +1 -18
- ggml-metal.m +9 -152
- ggml-metal.metal +6 -12
- ggml.c +45 -23
- ggml.h +7 -9
- gguf-py/gguf/gguf.py +42 -70
- gpttype_adapter.cpp +1 -1
- koboldcpp.py +53 -105
- llama.cpp +41 -820
- otherarch/llama_v3.cpp +8 -7
- spm-headers/ggml.h +7 -9
- tests/test-tokenizer-0-falcon.cpp +4 -4
- tests/test-tokenizer-0-falcon.py +4 -5
- tests/test-tokenizer-0-llama.cpp +3 -1
- tests/test-tokenizer-0-llama.py +4 -3
.gitignore
CHANGED
@@ -45,7 +45,6 @@ models-mnt
 /server
 /simple
 /batched
-/batched-bench
 /export-lora
 /finetune
 /speculative
@@ -107,6 +106,3 @@ tests/test-tokenizer-1-bpe
 rocblas.dll
 hipblas.dll
 koboldcpp_hipblas.so
-
-# Jetbrains idea folder
-.idea/
CMakeLists.txt
CHANGED
@@ -356,8 +356,6 @@ add_library(ggml OBJECT
     ggml.h
     ggml-alloc.c
     ggml-alloc.h
-    ggml-backend.c
-    ggml-backend.h
     k_quants.h
     k_quants.c
     ${GGML_SOURCES_CUDA})
Makefile
CHANGED
@@ -372,8 +372,6 @@ endif # LLAMA_NO_K_QUANTS
 #there's no intrinsics or special gpu ops used here, so we can have a universal object
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
     $(CC) $(CFLAGS) -c $< -o $@
-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-    $(CC) $(CFLAGS) -c $< -o $@
 
 #version 2 libs
 ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -404,7 +402,7 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
     $(CC) $(CFLAGS) -c $< -o $@
 
 # intermediate objects
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: common/common.cpp common/common.h common/log.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -429,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 clean:
     rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
-main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o
+main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
     @echo
     @echo '==== Run ./main -h for help. ===='
@@ -440,11 +438,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 
 
 #generated libraries
-koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o
+koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
     $(DEFAULT_BUILD)
 
 ifdef OPENBLAS_BUILD
-koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
     $(OPENBLAS_BUILD)
 else
 koboldcpp_openblas:
@@ -452,7 +450,7 @@ koboldcpp_openblas:
 endif
 
 ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
     $(FAILSAFE_BUILD)
 else
 koboldcpp_failsafe:
@@ -460,7 +458,7 @@ koboldcpp_failsafe:
 endif
 
 ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
     $(NOAVX2_BUILD)
 else
 koboldcpp_noavx2:
@@ -468,7 +466,7 @@ koboldcpp_noavx2:
 endif
 
 ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
     $(CLBLAST_BUILD)
 else
 koboldcpp_clblast:
@@ -476,7 +474,7 @@ koboldcpp_clblast:
 endif
 
 ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
     $(CUBLAS_BUILD)
 else
 koboldcpp_cublas:
@@ -484,7 +482,7 @@ koboldcpp_cublas:
 endif
 
 ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
     $(HIPBLAS_BUILD)
 else
 koboldcpp_hipblas:
@@ -492,15 +490,15 @@ koboldcpp_hipblas:
 endif
 
 # tools
-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
     $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
    $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
    $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
    $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
    $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 
Package.swift
CHANGED
@@ -1,10 +1,10 @@
-// swift-tools-version:5.
+// swift-tools-version:5.3
 
 import PackageDescription
 
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.
+    .macOS(.v11),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,13 +41,12 @@ let package = Package(
     "ggml.c",
     "llama.cpp",
     "ggml-alloc.c",
-    "ggml-backend.c",
     "k_quants.c",
 ] + additionalSources,
 resources: resources,
 publicHeadersPath: "spm-headers",
 cSettings: [
-    .unsafeFlags(["-Wno-shorten-64-to-32"
+    .unsafeFlags(["-Wno-shorten-64-to-32"]),
     .define("GGML_USE_K_QUANTS"),
     .define("GGML_USE_ACCELERATE")
     // NOTE: NEW_LAPACK will required iOS version 16.4+
colab.ipynb
CHANGED
@@ -33,7 +33,7 @@
 "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
 "!sleep 10\r\n",
 "!cat nohup.out\r\n",
-"!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers\r\n"
+"!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers --hordeconfig concedo\r\n"
 ]
 }
 ],
common/CMakeLists.txt
CHANGED
@@ -5,8 +5,6 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
-    sampling.h
-    sampling.cpp
     console.h
     console.cpp
     grammar-parser.h
common/common.cpp
CHANGED
@@ -107,7 +107,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -185,7 +184,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-
+            params.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -217,73 +216,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-
+            params.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.temp = std::stof(argv[i]);
        } else if (arg == "--tfs") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.tfs_z = std::stof(argv[i]);
        } else if (arg == "--typical") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.typical_p = std::stof(argv[i]);
        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.frequency_penalty = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.mirostat = std::stoi(argv[i]);
        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.mirostat_eta = std::stof(argv[i]);
        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.mirostat_tau = std::stof(argv[i]);
        } else if (arg == "--cfg-negative-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.cfg_negative_prompt = argv[i];
        } else if (arg == "--cfg-negative-prompt-file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -295,16 +294,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(
-            if (!
-
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-
+            params.cfg_scale = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -513,7 +512,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
-
+            params.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
            if (++i >= argc) {
                invalid_param = true;
@@ -525,7 +524,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::string value_str;
            try {
                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-
+                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                } else {
                    throw std::exception();
                }
@@ -628,8 +627,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sampling_params;
-
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -662,19 +659,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n",
-    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)
-    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)
-    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)
-    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n",
-    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)
-    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)
-    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)
+    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
     printf(" --mirostat N use Mirostat sampling.\n");
     printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n",
-    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)
-    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)
+    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
     printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf(" modifies the likelihood of token appearing in the completion,\n");
     printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -685,7 +682,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" negative prompt to use for guidance. (default: empty)\n");
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
-    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n",
+    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
     printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -693,7 +690,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf(" --temp N temperature (default: %.1f)\n", (double)
+    printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
     printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
     printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -843,7 +840,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (params.ignore_eos) {
-        params.
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }
 
     {
@@ -935,6 +932,127 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
+//
+// Sampling utils
+//
+
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx) {
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    const float temp = params.temp;
+    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float top_p = params.top_p;
+    const float tfs_z = params.tfs_z;
+    const float typical_p = params.typical_p;
+    const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float repeat_penalty = params.repeat_penalty;
+    const float alpha_presence = params.presence_penalty;
+    const float alpha_frequency = params.frequency_penalty;
+    const int mirostat = params.mirostat;
+    const float mirostat_tau = params.mirostat_tau;
+    const float mirostat_eta = params.mirostat_eta;
+    const bool penalize_nl = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits_ith(ctx, idx);
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (grammar != NULL) {
+        llama_sample_grammar(ctx, &cur_p, grammar);
+    }
+
+    if (temp <= 0) {
+        // Greedy sampling
+        id = llama_sample_token_greedy(ctx, &cur_p);
+    } else {
+        if (mirostat == 1) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            const int mirostat_m = 100;
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+        } else if (mirostat == 2) {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            llama_sample_temp(ctx, &cur_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+        } else {
+            // Temperature sampling
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k(ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free(ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical(ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p(ctx, &cur_p, top_p, min_keep);
+            llama_sample_temp(ctx, &cur_p, temp);
+
+            {
+                const int n_top = 10;
+                LOG("top %d candidates:\n", n_top);
+
+                for (int i = 0; i < n_top; i++) {
+                    const llama_token id = cur_p.data[i].id;
+                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
+                }
+            }
+
+            id = llama_sample_token(ctx, &cur_p);
+
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
+        }
+    }
+    // printf("`%d`", candidates_p.size);
+
+    if (grammar != NULL) {
+        llama_grammar_accept_token(ctx, grammar, id);
+    }
+
+    return id;
+}
+
 //
 // YAML utils
 //
@@ -1086,8 +1204,6 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sampling_params;
-
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1134,21 +1250,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 
     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt",
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n",
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n",
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
 
-    const auto logit_bias_eos =
-    const bool ignore_eos = logit_bias_eos !=
+    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
+    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
     fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
 
     dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1161,7 +1277,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb :
+    for (std::pair<llama_token, float> lb : params.logit_bias) {
         if (ignore_eos && lb.first == logit_bias_eos->first) {
             continue;
         }
@@ -1185,30 +1301,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n",
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n",
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n",
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n",
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
     fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n",
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
     dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
     fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
     fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n",
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
 
     fprintf(stream, "reverse_prompt:\n");
     for (std::string ap : params.antiprompt) {
@@ -1226,15 +1342,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n",
+    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
-    fprintf(stream, "tfs: %f # default: 1.0\n",
+    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n",
-    fprintf(stream, "top_p: %f # default: 0.95\n",
-    fprintf(stream, "typical_p: %f # default: 1.0\n",
+    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
common/common.h
CHANGED
@@ -4,8 +4,6 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -51,6 +49,7 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -68,8 +67,13 @@ struct gpt_params {
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-
-
+
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+    // Classifier-Free Guidance
+    // https://arxiv.org/abs/2306.17806
+    std::string cfg_negative_prompt; // string to help guidance
+    float cfg_scale = 1.f; // How strong is guidance
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -111,6 +115,7 @@ struct gpt_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
     bool instruct = false; // instruction mode (used for Alpaca models)
+    bool penalize_nl = true; // consider newlines as a repeatable token
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -175,6 +180,36 @@ std::string llama_detokenize_bpe(
     llama_context * ctx,
     const std::vector<llama_token> & tokens);
 
+//
+// Sampling utils
+//
+
+// this is a common sampling function used across the examples for convenience
+// it can serve as a starting point for implementing your own sampling function
+//
+// required:
+//  - ctx: context to use for sampling
+//  - params: sampling parameters
+//
+// optional:
+//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
+//  - grammar: grammar to use for sampling, ignore if NULL
+//  - last_tokens: needed for repetition penalty, ignore if empty
//  - idx: sample from llama_get_logits_ith(ctx, idx)
+//
+// returns:
+//  - token: sampled token
+//  - candidates: vector of candidate tokens
+//
+llama_token llama_sample_token(
+        struct llama_context * ctx,
+        struct llama_context * ctx_guidance,
+        struct llama_grammar * grammar,
+        const struct gpt_params & params,
+        const std::vector<llama_token> & last_tokens,
+        std::vector<llama_token_data> & candidates,
+        int idx = 0);
+
 //
 // YAML utils
 //
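The llama_sample_token helper declared above is the piece most examples in this commit switch to. Purely as an illustration (not part of the commit), a minimal decode loop driving it could look like the C++ sketch below; the model path, the parameter values, and the elided llama_decode step are assumptions for the example only.

    // Hypothetical sketch of using the llama_sample_token helper from common.h.
    // Prompt tokenization and the llama_decode feedback step are intentionally elided.
    #include "common.h"
    #include "llama.h"
    #include <tuple>
    #include <vector>

    int main() {
        gpt_params params;
        params.model     = "model.gguf"; // placeholder path, not from the commit
        params.n_predict = 64;           // illustrative settings
        params.temp      = 0.8f;
        params.top_k     = 40;
        params.top_p     = 0.95f;

        llama_backend_init(params.numa);
        llama_model * model; llama_context * ctx;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);

        std::vector<llama_token>      last_tokens; // history for the repetition penalty
        std::vector<llama_token_data> candidates;  // scratch buffer reused by the sampler
        candidates.reserve(llama_n_vocab(llama_get_model(ctx)));

        // a real program would tokenize a prompt and run llama_decode here first
        for (int i = 0; i < params.n_predict; ++i) {
            // no guidance context and no grammar in this sketch, so both are NULL
            const llama_token id = llama_sample_token(ctx, NULL, NULL, params, last_tokens, candidates);
            last_tokens.push_back(id);
            if (id == llama_token_eos(ctx)) {
                break;
            }
            // ... feed `id` back through llama_decode and print llama_token_to_piece(ctx, id)
        }

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }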
convert-refact-hf-to-gguf.py
CHANGED
@@ -17,6 +17,33 @@ if "NO_LOCAL_GGUF" not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
 import gguf
 
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1))
+        + list(range(ord("¡"), ord("¬") + 1))
+        + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    return dict(zip(bs, (chr(n) for n in cs)))
+
+
 def count_model_parts(dir_model: Path) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -126,25 +153,53 @@ tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 
+tokenizer_json_file = dir_model / "tokenizer.json"
+if not tokenizer_json_file.is_file():
+    print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
+    sys.exit(1)
+
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)
+
+print("gguf: get gpt2 tokenizer vocab")
 
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size =
-
+vocab_size = (
+    hparams["vocab_size"]
+    if "vocab_size" in hparams
+    else len(tokenizer_json["model"]["vocab"])
+)
+
+tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 for i in range(vocab_size):
-
-
-
+    if i in reverse_vocab:
+        text = reverse_vocab[i]
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[ord(c)])
+                else:  # multibyte special token character
+                    text.extend(c.encode("utf-8"))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+    scores.append(0.0)  # dymmy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy
 
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
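The bytes_to_unicode helper added to the converter above is the standard GPT-2 trick for making every byte representable as a printable unicode character, so tokenizer.json entries can be decoded back into raw bytes. As an illustration only (the commit's actual helper is the Python function in the diff above), the same table can be sketched in C++; the function name here is hypothetical.

    // Illustrative sketch of the GPT-2 byte <-> unicode mapping the converter relies on:
    // printable byte values map to themselves, all other byte values are shifted to
    // codepoints >= 256 so no byte lands on a whitespace/control character.
    #include <cstdint>
    #include <map>

    static std::map<uint8_t, char32_t> gpt2_bytes_to_unicode() { // hypothetical name
        std::map<uint8_t, char32_t> table;
        int n = 0;
        for (int b = 0; b < 256; ++b) {
            const bool printable =
                (b >= 0x21 && b <= 0x7E) ||  // '!' .. '~'
                (b >= 0xA1 && b <= 0xAC) ||  // '¡' .. '¬'
                (b >= 0xAE && b <= 0xFF);    // '®' .. 'ÿ'
            table[(uint8_t) b] = printable ? (char32_t) b : (char32_t) (256 + n++);
        }
        return table;
    }

The converter builds the inverse table (its byte_decoder) from this mapping to turn each vocab string back into the byte sequence written into the GGUF vocabulary.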
examples/CMakeLists.txt
CHANGED
@@ -25,7 +25,6 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(batched)
-    add_subdirectory(batched-bench)
     add_subdirectory(speculative)
     add_subdirectory(parallel)
     add_subdirectory(embd-input)
examples/batched/batched.cpp
CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
|
|
66 |
ctx_params.seed = 1234;
|
67 |
ctx_params.n_ctx = n_kv_req;
|
68 |
ctx_params.n_batch = std::max(n_len, n_parallel);
|
69 |
-
ctx_params.n_threads
|
70 |
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
71 |
|
72 |
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
|
|
66 |
ctx_params.seed = 1234;
|
67 |
ctx_params.n_ctx = n_kv_req;
|
68 |
ctx_params.n_batch = std::max(n_len, n_parallel);
|
69 |
+
ctx_params.n_threads = params.n_threads;
|
70 |
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
71 |
|
72 |
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
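Note: the change above restores the explicit thread settings on the context, with n_threads_batch falling back to n_threads when it is left at -1. A condensed sketch of that setup, assuming a model already loaded elsewhere (the helper name make_context is invented for the example):

// Sketch of the context setup used by examples/batched after this change.
#include "common.h"
#include "llama.h"
#include <algorithm>

static llama_context * make_context(llama_model * model, const gpt_params & params,
                                    int n_len, int n_parallel, int n_kv_req) {
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.seed            = 1234;
    ctx_params.n_ctx           = n_kv_req;
    ctx_params.n_batch         = std::max(n_len, n_parallel);
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    return llama_new_context_with_model(model, ctx_params);
}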
examples/embd-input/embd-input-lib.cpp
CHANGED
@@ -128,22 +128,21 @@ bool eval_string(struct MyModel * mymodel,const char* str){
|
|
128 |
llama_token sampling_id(struct MyModel* mymodel) {
|
129 |
llama_context* ctx = mymodel->ctx;
|
130 |
gpt_params params = mymodel->params;
|
131 |
-
llama_sampling_params & sparams = params.sampling_params;
|
132 |
// int n_ctx = llama_n_ctx(ctx);
|
133 |
|
134 |
// out of user input, sample next token
|
135 |
-
const float temp =
|
136 |
-
const int32_t top_k =
|
137 |
-
const float top_p =
|
138 |
-
const float tfs_z =
|
139 |
-
const float typical_p =
|
140 |
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
141 |
// const float repeat_penalty = params.repeat_penalty;
|
142 |
// const float alpha_presence = params.presence_penalty;
|
143 |
// const float alpha_frequency = params.frequency_penalty;
|
144 |
-
const int mirostat =
|
145 |
-
const float mirostat_tau =
|
146 |
-
const float mirostat_eta =
|
147 |
// const bool penalize_nl = params.penalize_nl;
|
148 |
|
149 |
llama_token id = 0;
|
@@ -152,7 +151,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
|
152 |
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
153 |
|
154 |
// Apply params.logit_bias map
|
155 |
-
for (auto it =
|
156 |
logits[it->first] += it->second;
|
157 |
}
|
158 |
|
|
|
128 |
llama_token sampling_id(struct MyModel* mymodel) {
|
129 |
llama_context* ctx = mymodel->ctx;
|
130 |
gpt_params params = mymodel->params;
|
|
|
131 |
// int n_ctx = llama_n_ctx(ctx);
|
132 |
|
133 |
// out of user input, sample next token
|
134 |
+
const float temp = params.temp;
|
135 |
+
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
|
136 |
+
const float top_p = params.top_p;
|
137 |
+
const float tfs_z = params.tfs_z;
|
138 |
+
const float typical_p = params.typical_p;
|
139 |
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
140 |
// const float repeat_penalty = params.repeat_penalty;
|
141 |
// const float alpha_presence = params.presence_penalty;
|
142 |
// const float alpha_frequency = params.frequency_penalty;
|
143 |
+
const int mirostat = params.mirostat;
|
144 |
+
const float mirostat_tau = params.mirostat_tau;
|
145 |
+
const float mirostat_eta = params.mirostat_eta;
|
146 |
// const bool penalize_nl = params.penalize_nl;
|
147 |
|
148 |
llama_token id = 0;
|
|
|
151 |
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
152 |
|
153 |
// Apply params.logit_bias map
|
154 |
+
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
155 |
logits[it->first] += it->second;
|
156 |
}
|
157 |
|
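Note: sampling_id() now reads every sampling knob directly from gpt_params and applies the logit_bias map to the raw logits before any sampler runs. A minimal sketch of that bias step (apply_logit_bias is a name invented for the example; the loop mirrors the one restored above):

// Sketch: add the user-supplied per-token biases onto the logits of the last
// evaluated token before sampling.
#include "common.h"
#include "llama.h"

static void apply_logit_bias(llama_context * ctx, const gpt_params & params) {
    float * logits = llama_get_logits(ctx);
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); ++it) {
        logits[it->first] += it->second; // it->first is the llama_token id
    }
}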
examples/infill/infill.cpp
CHANGED
@@ -104,7 +104,6 @@ static void sigint_handler(int signo) {
|
|
104 |
|
105 |
int main(int argc, char ** argv) {
|
106 |
gpt_params params;
|
107 |
-
llama_sampling_params & sparams = params.sampling_params;
|
108 |
g_params = ¶ms;
|
109 |
|
110 |
if (!gpt_params_parse(argc, argv, params)) {
|
@@ -207,7 +206,7 @@ int main(int argc, char ** argv) {
|
|
207 |
// load the model and apply lora adapter, if any
|
208 |
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
209 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
210 |
-
if (
|
211 |
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
212 |
ctx_guidance = llama_new_context_with_model(model, lparams);
|
213 |
}
|
@@ -234,22 +233,10 @@ int main(int argc, char ** argv) {
|
|
234 |
const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
|
235 |
LOG("add_bos: %d\n", add_bos);
|
236 |
|
237 |
-
bool suff_rm_leading_spc = params.escape;
|
238 |
-
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
239 |
-
params.input_suffix.erase(0, 1);
|
240 |
-
suff_rm_leading_spc = false;
|
241 |
-
}
|
242 |
std::vector<llama_token> embd_inp;
|
243 |
-
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix,
|
244 |
-
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix,
|
245 |
-
const int space_token = 29871;
|
246 |
-
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
247 |
-
inp_sfx.erase(inp_sfx.begin());
|
248 |
-
}
|
249 |
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
250 |
-
if (add_bos) {
|
251 |
-
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
252 |
-
}
|
253 |
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
254 |
embd_inp = inp_pfx;
|
255 |
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
@@ -270,9 +257,9 @@ int main(int argc, char ** argv) {
|
|
270 |
int guidance_offset = 0;
|
271 |
int original_prompt_len = 0;
|
272 |
if (ctx_guidance) {
|
273 |
-
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(
|
274 |
|
275 |
-
guidance_inp = ::llama_tokenize(ctx_guidance,
|
276 |
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
277 |
|
278 |
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
@@ -313,7 +300,7 @@ int main(int argc, char ** argv) {
|
|
313 |
|
314 |
if (ctx_guidance) {
|
315 |
LOG_TEE("\n");
|
316 |
-
LOG_TEE("%s: negative prompt: '%s'\n", __func__,
|
317 |
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
318 |
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
319 |
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
@@ -359,7 +346,7 @@ int main(int argc, char ** argv) {
|
|
359 |
}
|
360 |
}
|
361 |
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
362 |
-
|
363 |
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
364 |
LOG_TEE("\n\n");
|
365 |
|
@@ -377,8 +364,8 @@ int main(int argc, char ** argv) {
|
|
377 |
LOG_TEE("\n");
|
378 |
|
379 |
{
|
380 |
-
auto it =
|
381 |
-
if (it !=
|
382 |
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
383 |
}
|
384 |
}
|
@@ -435,7 +422,6 @@ int main(int argc, char ** argv) {
|
|
435 |
|
436 |
const int n_vocab = llama_n_vocab(model);
|
437 |
|
438 |
-
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
|
439 |
std::vector<llama_token_data> candidates;
|
440 |
candidates.reserve(n_vocab);
|
441 |
|
@@ -554,7 +540,7 @@ int main(int argc, char ** argv) {
|
|
554 |
|
555 |
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
556 |
|
557 |
-
const llama_token id =
|
558 |
|
559 |
last_tokens.erase(last_tokens.begin());
|
560 |
last_tokens.push_back(id);
|
@@ -641,27 +627,10 @@ int main(int argc, char ** argv) {
|
|
641 |
buffer.clear();
|
642 |
// done taking input, reset color
|
643 |
console::set_display(console::reset);
|
644 |
-
|
645 |
-
if (params.escape) {
|
646 |
-
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
647 |
-
process_escapes(params.input_prefix);
|
648 |
-
process_escapes(params.input_suffix);
|
649 |
-
}
|
650 |
-
suff_rm_leading_spc = params.escape;
|
651 |
-
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
652 |
-
params.input_suffix.erase(0, 1);
|
653 |
-
suff_rm_leading_spc = false;
|
654 |
-
}
|
655 |
// tokenize new prefix and suffix
|
656 |
-
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix,
|
657 |
-
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix,
|
658 |
-
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
659 |
-
inp_sfx.erase(inp_sfx.begin());
|
660 |
-
}
|
661 |
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
662 |
-
if (add_bos) {
|
663 |
-
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
664 |
-
}
|
665 |
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
666 |
embd_inp = inp_pfx;
|
667 |
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
|
|
104 |
|
105 |
int main(int argc, char ** argv) {
|
106 |
gpt_params params;
|
|
|
107 |
g_params = ¶ms;
|
108 |
|
109 |
if (!gpt_params_parse(argc, argv, params)) {
|
|
|
206 |
// load the model and apply lora adapter, if any
|
207 |
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
208 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
209 |
+
if (params.cfg_scale > 1.f) {
|
210 |
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
211 |
ctx_guidance = llama_new_context_with_model(model, lparams);
|
212 |
}
|
|
|
233 |
const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
|
234 |
LOG("add_bos: %d\n", add_bos);
|
235 |
|
|
|
|
|
|
|
|
|
|
|
236 |
std::vector<llama_token> embd_inp;
|
237 |
+
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
|
238 |
+
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
|
|
|
|
|
|
|
|
|
239 |
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
|
|
|
|
|
|
240 |
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
241 |
embd_inp = inp_pfx;
|
242 |
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
|
|
257 |
int guidance_offset = 0;
|
258 |
int original_prompt_len = 0;
|
259 |
if (ctx_guidance) {
|
260 |
+
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
261 |
|
262 |
+
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
263 |
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
264 |
|
265 |
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
|
|
300 |
|
301 |
if (ctx_guidance) {
|
302 |
LOG_TEE("\n");
|
303 |
+
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
304 |
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
305 |
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
306 |
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
|
|
346 |
}
|
347 |
}
|
348 |
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
349 |
+
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
350 |
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
351 |
LOG_TEE("\n\n");
|
352 |
|
|
|
364 |
LOG_TEE("\n");
|
365 |
|
366 |
{
|
367 |
+
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
368 |
+
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
369 |
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
370 |
}
|
371 |
}
|
|
|
422 |
|
423 |
const int n_vocab = llama_n_vocab(model);
|
424 |
|
|
|
425 |
std::vector<llama_token_data> candidates;
|
426 |
candidates.reserve(n_vocab);
|
427 |
|
|
|
540 |
|
541 |
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
542 |
|
543 |
+
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
544 |
|
545 |
last_tokens.erase(last_tokens.begin());
|
546 |
last_tokens.push_back(id);
|
|
|
627 |
buffer.clear();
|
628 |
// done taking input, reset color
|
629 |
console::set_display(console::reset);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
630 |
// tokenize new prefix and suffix
|
631 |
+
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
|
632 |
+
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);
|
|
|
|
|
|
|
633 |
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
|
|
|
|
|
|
634 |
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
635 |
embd_inp = inp_pfx;
|
636 |
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
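Note: with the leading-space stripping gone, the infill prompt is simply <PRE> prefix <SUF> suffix <MID>: prefix and suffix are tokenized (with BOS when the vocab expects it), wrapped in the special infill tokens, and the middle token marks where generation starts. A condensed sketch of that layout, assuming a loaded context and the prefix/suffix strings from gpt_params (build_infill_prompt is an invented name):

// Sketch: build a fill-in-the-middle prompt the way the reverted
// examples/infill does: <PRE> prefix <SUF> suffix <MID>
#include "common.h"
#include "llama.h"
#include <vector>

static std::vector<llama_token> build_infill_prompt(llama_context * ctx,
                                                    const gpt_params & params,
                                                    bool add_bos) {
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos);

    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); // <PRE>
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); // <SUF>

    std::vector<llama_token> embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
    embd_inp.push_back(llama_token_middle(ctx));              // <MID>: generate from here
    return embd_inp;
}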
examples/main/main.cpp
CHANGED
@@ -109,7 +109,6 @@ int main(int argc, char ** argv) {
|
|
109 |
if (!gpt_params_parse(argc, argv, params)) {
|
110 |
return 1;
|
111 |
}
|
112 |
-
llama_sampling_params & sparams = params.sampling_params;
|
113 |
|
114 |
#ifndef LOG_DISABLE_LOGS
|
115 |
log_set_target(log_filename_generator("main", "log"));
|
@@ -180,7 +179,7 @@ int main(int argc, char ** argv) {
|
|
180 |
// load the model and apply lora adapter, if any
|
181 |
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
182 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
183 |
-
if (
|
184 |
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
185 |
ctx_guidance = llama_new_context_with_model(model, lparams);
|
186 |
}
|
@@ -258,9 +257,9 @@ int main(int argc, char ** argv) {
|
|
258 |
int guidance_offset = 0;
|
259 |
int original_prompt_len = 0;
|
260 |
if (ctx_guidance) {
|
261 |
-
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(
|
262 |
|
263 |
-
guidance_inp = ::llama_tokenize(ctx_guidance,
|
264 |
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
265 |
|
266 |
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
@@ -297,9 +296,6 @@ int main(int argc, char ** argv) {
|
|
297 |
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
298 |
__func__, n_matching_session_tokens, embd_inp.size());
|
299 |
}
|
300 |
-
|
301 |
-
// remove any "future" tokens that we might have inherited from the previous session
|
302 |
-
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
303 |
}
|
304 |
|
305 |
LOGLN(
|
@@ -347,7 +343,7 @@ int main(int argc, char ** argv) {
|
|
347 |
|
348 |
if (ctx_guidance) {
|
349 |
LOG_TEE("\n");
|
350 |
-
LOG_TEE("%s: negative prompt: '%s'\n", __func__,
|
351 |
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
352 |
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
353 |
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
@@ -399,7 +395,7 @@ int main(int argc, char ** argv) {
|
|
399 |
}
|
400 |
}
|
401 |
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
402 |
-
|
403 |
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
404 |
LOG_TEE("\n\n");
|
405 |
|
@@ -417,8 +413,8 @@ int main(int argc, char ** argv) {
|
|
417 |
LOG_TEE("\n");
|
418 |
|
419 |
{
|
420 |
-
auto it =
|
421 |
-
if (it !=
|
422 |
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
423 |
}
|
424 |
}
|
@@ -473,7 +469,6 @@ int main(int argc, char ** argv) {
|
|
473 |
|
474 |
const int n_vocab = llama_n_vocab(model);
|
475 |
|
476 |
-
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
|
477 |
std::vector<llama_token_data> candidates;
|
478 |
candidates.reserve(n_vocab);
|
479 |
|
@@ -548,6 +543,9 @@ int main(int argc, char ** argv) {
|
|
548 |
if (i > 0) {
|
549 |
embd.erase(embd.begin(), embd.begin() + i);
|
550 |
}
|
|
|
|
|
|
|
551 |
}
|
552 |
|
553 |
// evaluate tokens in batches
|
@@ -627,7 +625,7 @@ int main(int argc, char ** argv) {
|
|
627 |
LOG("saved session to %s\n", path_session.c_str());
|
628 |
}
|
629 |
|
630 |
-
const llama_token id =
|
631 |
|
632 |
last_tokens.erase(last_tokens.begin());
|
633 |
last_tokens.push_back(id);
|
|
|
109 |
if (!gpt_params_parse(argc, argv, params)) {
|
110 |
return 1;
|
111 |
}
|
|
|
112 |
|
113 |
#ifndef LOG_DISABLE_LOGS
|
114 |
log_set_target(log_filename_generator("main", "log"));
|
|
|
179 |
// load the model and apply lora adapter, if any
|
180 |
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
181 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
182 |
+
if (params.cfg_scale > 1.f) {
|
183 |
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
184 |
ctx_guidance = llama_new_context_with_model(model, lparams);
|
185 |
}
|
|
|
257 |
int guidance_offset = 0;
|
258 |
int original_prompt_len = 0;
|
259 |
if (ctx_guidance) {
|
260 |
+
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
261 |
|
262 |
+
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
263 |
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
264 |
|
265 |
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
|
|
296 |
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
297 |
__func__, n_matching_session_tokens, embd_inp.size());
|
298 |
}
|
|
|
|
|
|
|
299 |
}
|
300 |
|
301 |
LOGLN(
|
|
|
343 |
|
344 |
if (ctx_guidance) {
|
345 |
LOG_TEE("\n");
|
346 |
+
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
347 |
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
348 |
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
349 |
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
|
|
395 |
}
|
396 |
}
|
397 |
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
398 |
+
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
399 |
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
400 |
LOG_TEE("\n\n");
|
401 |
|
|
|
413 |
LOG_TEE("\n");
|
414 |
|
415 |
{
|
416 |
+
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
417 |
+
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
418 |
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
419 |
}
|
420 |
}
|
|
|
469 |
|
470 |
const int n_vocab = llama_n_vocab(model);
|
471 |
|
|
|
472 |
std::vector<llama_token_data> candidates;
|
473 |
candidates.reserve(n_vocab);
|
474 |
|
|
|
543 |
if (i > 0) {
|
544 |
embd.erase(embd.begin(), embd.begin() + i);
|
545 |
}
|
546 |
+
|
547 |
+
// remove any "future" tokens that we might have inherited from the session from the KV cache
|
548 |
+
llama_kv_cache_tokens_rm(ctx, n_past, -1);
|
549 |
}
|
550 |
|
551 |
// evaluate tokens in batches
|
|
|
625 |
LOG("saved session to %s\n", path_session.c_str());
|
626 |
}
|
627 |
|
628 |
+
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
629 |
|
630 |
last_tokens.erase(last_tokens.begin());
|
631 |
last_tokens.push_back(id);
|
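Note: the session-restore path in main.cpp now trims the KV cache at decode time rather than right after matching the session, so any "future" tokens inherited from a longer saved session are dropped before new tokens are evaluated. A minimal sketch of that single call, assuming n_past counts the cached tokens being kept:

// Sketch: keep the first n_past cached tokens, drop everything after them.
#include "llama.h"

static void trim_session_cache(llama_context * ctx, int n_past) {
    llama_kv_cache_tokens_rm(ctx, n_past, -1); // -1 = up to the end of the cache
}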
examples/parallel/parallel.cpp
CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
|
|
125 |
params.logits_all = true;
|
126 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
127 |
|
128 |
-
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
|
129 |
-
|
130 |
// load the prompts from an external file if there are any
|
131 |
if (params.prompt.empty()) {
|
132 |
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
@@ -169,7 +167,7 @@ int main(int argc, char ** argv) {
|
|
169 |
|
170 |
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
171 |
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
172 |
-
llama_batch batch = llama_batch_init(n_ctx, 0);
|
173 |
|
174 |
int32_t n_total_prompt = 0;
|
175 |
int32_t n_total_gen = 0;
|
@@ -341,7 +339,7 @@ int main(int argc, char ** argv) {
|
|
341 |
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
342 |
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
343 |
|
344 |
-
const llama_token id =
|
345 |
|
346 |
if (client.n_decoded == 1) {
|
347 |
// start measuring generation time after the first token to make sure all concurrent clients
|
@@ -386,7 +384,7 @@ int main(int argc, char ** argv) {
|
|
386 |
|
387 |
n_total_prompt += client.n_prompt;
|
388 |
n_total_gen += client.n_decoded;
|
389 |
-
|
390 |
client.seq_id = -1;
|
391 |
}
|
392 |
|
|
|
125 |
params.logits_all = true;
|
126 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
127 |
|
|
|
|
|
128 |
// load the prompts from an external file if there are any
|
129 |
if (params.prompt.empty()) {
|
130 |
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
|
|
167 |
|
168 |
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
169 |
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
170 |
+
llama_batch batch = llama_batch_init(params.n_ctx, 0);
|
171 |
|
172 |
int32_t n_total_prompt = 0;
|
173 |
int32_t n_total_gen = 0;
|
|
|
339 |
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
340 |
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
341 |
|
342 |
+
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
|
343 |
|
344 |
if (client.n_decoded == 1) {
|
345 |
// start measuring generation time after the first token to make sure all concurrent clients
|
|
|
384 |
|
385 |
n_total_prompt += client.n_prompt;
|
386 |
n_total_gen += client.n_decoded;
|
387 |
+
|
388 |
client.seq_id = -1;
|
389 |
}
|
390 |
|
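Note: the parallel example goes back to allocating a single batch sized to the whole context; the decode loop then feeds it in chunks of at most n_batch tokens. A minimal sketch of the allocation and cleanup, assuming this revision's two-argument llama_batch_init(n_tokens, embd):

// Sketch: one token batch large enough to hold a full context.
#include "llama.h"

static void with_full_context_batch(int n_ctx) {
    llama_batch batch = llama_batch_init(n_ctx, 0); // 0 = plain token ids, no embeddings
    // ... fill batch.n_tokens, batch.token, batch.pos, batch.seq_id, then llama_decode ...
    llama_batch_free(batch);
}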
examples/save-load-state/save-load-state.cpp
CHANGED
@@ -8,10 +8,9 @@
|
|
8 |
|
9 |
int main(int argc, char ** argv) {
|
10 |
gpt_params params;
|
11 |
-
llama_sampling_params & sparams = params.sampling_params;
|
12 |
params.seed = 42;
|
13 |
params.n_threads = 4;
|
14 |
-
|
15 |
params.prompt = "The quick brown fox";
|
16 |
|
17 |
if (!gpt_params_parse(argc, argv, params)) {
|
@@ -25,7 +24,7 @@ int main(int argc, char ** argv) {
|
|
25 |
}
|
26 |
|
27 |
auto n_past = 0;
|
28 |
-
auto last_n_tokens_data = std::vector<llama_token>(
|
29 |
|
30 |
// init
|
31 |
llama_model * model;
|
|
|
8 |
|
9 |
int main(int argc, char ** argv) {
|
10 |
gpt_params params;
|
|
|
11 |
params.seed = 42;
|
12 |
params.n_threads = 4;
|
13 |
+
params.repeat_last_n = 64;
|
14 |
params.prompt = "The quick brown fox";
|
15 |
|
16 |
if (!gpt_params_parse(argc, argv, params)) {
|
|
|
24 |
}
|
25 |
|
26 |
auto n_past = 0;
|
27 |
+
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
28 |
|
29 |
// init
|
30 |
llama_model * model;
|
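Note: last_n_tokens_data starts as repeat_last_n zeros; it is the fixed-size window the repetition penalty looks at, rotated after every sampled token. A tiny sketch of that rotation, as used throughout the examples (push_last_token is an invented name):

// Sketch: rotate the fixed-size "last tokens" window used for repetition penalties.
#include "llama.h"
#include <vector>

static void push_last_token(std::vector<llama_token> & last_tokens, llama_token id) {
    last_tokens.erase(last_tokens.begin()); // drop the oldest entry
    last_tokens.push_back(id);              // append the newly sampled token
}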
examples/server/index.html.hpp
CHANGED
The diff for this file is too large to render.
|
|
examples/server/public/index.html
CHANGED
@@ -136,11 +136,6 @@
|
|
136 |
display: block;
|
137 |
}
|
138 |
|
139 |
-
fieldset label.slim {
|
140 |
-
margin: 0 0.5em;
|
141 |
-
display: inline;
|
142 |
-
}
|
143 |
-
|
144 |
header, footer {
|
145 |
text-align: center;
|
146 |
}
|
@@ -150,14 +145,6 @@
|
|
150 |
color: #888;
|
151 |
}
|
152 |
|
153 |
-
.mode-chat textarea[name=prompt] {
|
154 |
-
height: 4.5em;
|
155 |
-
}
|
156 |
-
|
157 |
-
.mode-completion textarea[name=prompt] {
|
158 |
-
height: 10em;
|
159 |
-
}
|
160 |
-
|
161 |
|
162 |
@keyframes loading-bg-wipe {
|
163 |
0% {
|
@@ -200,7 +187,7 @@
|
|
200 |
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
201 |
historyTemplate: "{{name}}: {{message}}",
|
202 |
transcript: [],
|
203 |
-
type: "chat",
|
204 |
char: "Llama",
|
205 |
user: "User",
|
206 |
})
|
@@ -378,44 +365,13 @@
|
|
378 |
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
379 |
}
|
380 |
|
381 |
-
async function runLlama(prompt, llamaParams, char) {
|
382 |
-
const currentMessages = [];
|
383 |
-
const history = session.value.transcript;
|
384 |
-
if (controller.value) {
|
385 |
-
throw new Error("already running");
|
386 |
-
}
|
387 |
-
controller.value = new AbortController();
|
388 |
-
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
389 |
-
const data = chunk.data;
|
390 |
-
|
391 |
-
if (data.stop) {
|
392 |
-
while (
|
393 |
-
currentMessages.length > 0 &&
|
394 |
-
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
395 |
-
) {
|
396 |
-
currentMessages.pop();
|
397 |
-
}
|
398 |
-
transcriptUpdate([...history, [char, currentMessages]])
|
399 |
-
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
400 |
-
} else {
|
401 |
-
currentMessages.push(data);
|
402 |
-
transcriptUpdate([...history, [char, currentMessages]])
|
403 |
-
}
|
404 |
-
|
405 |
-
if (data.timings) {
|
406 |
-
llamaStats.value = data.timings;
|
407 |
-
}
|
408 |
-
}
|
409 |
-
|
410 |
-
controller.value = null;
|
411 |
-
}
|
412 |
-
|
413 |
// send message to server
|
414 |
const chat = async (msg) => {
|
415 |
if (controller.value) {
|
416 |
console.log('already running...');
|
417 |
return;
|
418 |
}
|
|
|
419 |
|
420 |
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
421 |
|
@@ -435,41 +391,55 @@
|
|
435 |
).join("\n"),
|
436 |
});
|
437 |
|
438 |
-
|
|
|
|
|
|
|
439 |
...params.value,
|
440 |
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
441 |
-
}, "{{char}}");
|
442 |
-
}
|
443 |
-
|
444 |
-
const runCompletion = async () => {
|
445 |
-
if (controller.value) {
|
446 |
-
console.log('already running...');
|
447 |
-
return;
|
448 |
}
|
449 |
-
const {prompt} = session.value;
|
450 |
-
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
451 |
-
await runLlama(prompt, {
|
452 |
-
...params.value,
|
453 |
-
stop: [],
|
454 |
-
}, "");
|
455 |
-
}
|
456 |
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
462 |
}
|
463 |
-
}
|
464 |
|
465 |
-
|
466 |
-
stop(e);
|
467 |
-
transcriptUpdate([]);
|
468 |
}
|
469 |
|
470 |
function MessageInput() {
|
471 |
const message = useSignal("")
|
472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
const submit = (e) => {
|
474 |
stop(e);
|
475 |
chat(message.value);
|
@@ -504,19 +474,6 @@
|
|
504 |
`
|
505 |
}
|
506 |
|
507 |
-
function CompletionControls() {
|
508 |
-
const submit = (e) => {
|
509 |
-
stop(e);
|
510 |
-
runCompletion();
|
511 |
-
}
|
512 |
-
return html`
|
513 |
-
<div>
|
514 |
-
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
515 |
-
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
516 |
-
<button onclick=${reset}>Reset</button>
|
517 |
-
</div>`;
|
518 |
-
}
|
519 |
-
|
520 |
const ChatLog = (props) => {
|
521 |
const messages = session.value.transcript;
|
522 |
const container = useRef(null)
|
@@ -540,11 +497,7 @@
|
|
540 |
data;
|
541 |
message = html`<${Markdownish} text=${template(text)} />`
|
542 |
}
|
543 |
-
|
544 |
-
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
545 |
-
} else {
|
546 |
-
return html`<p key=${index}>${message}</p>`
|
547 |
-
}
|
548 |
};
|
549 |
|
550 |
return html`
|
@@ -621,31 +574,18 @@
|
|
621 |
userTemplateAutosave()
|
622 |
}, [session.value, params.value])
|
623 |
|
624 |
-
|
625 |
-
|
626 |
-
<
|
627 |
-
|
628 |
-
|
629 |
-
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
630 |
-
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
631 |
-
</div>
|
632 |
-
`
|
633 |
-
);
|
634 |
-
|
635 |
-
const PromptControlFieldSet = () => (
|
636 |
-
html`
|
637 |
-
<fieldset>
|
638 |
-
<div>
|
639 |
-
<label htmlFor="prompt">Prompt</label>
|
640 |
-
<textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
|
641 |
-
</div>
|
642 |
-
</fieldset>
|
643 |
-
`
|
644 |
-
);
|
645 |
|
646 |
-
|
647 |
-
|
648 |
-
|
|
|
|
|
|
|
649 |
|
650 |
<fieldset class="two">
|
651 |
<div>
|
@@ -669,30 +609,15 @@
|
|
669 |
<label for="template">Chat history template</label>
|
670 |
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
671 |
</div>
|
672 |
-
${GrammarControl()}
|
673 |
-
</fieldset>
|
674 |
-
`
|
675 |
-
);
|
676 |
-
|
677 |
-
const CompletionConfigForm = () => (
|
678 |
-
html`
|
679 |
-
${PromptControlFieldSet()}
|
680 |
-
<fieldset>${GrammarControl()}</fieldset>
|
681 |
-
`
|
682 |
-
);
|
683 |
|
684 |
-
return html`
|
685 |
-
<form>
|
686 |
-
<fieldset class="two">
|
687 |
-
<${UserTemplateResetButton}/>
|
688 |
<div>
|
689 |
-
<label
|
690 |
-
<
|
|
|
|
|
691 |
</div>
|
692 |
</fieldset>
|
693 |
|
694 |
-
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
695 |
-
|
696 |
<fieldset class="two">
|
697 |
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
698 |
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
@@ -926,7 +851,7 @@
|
|
926 |
function App(props) {
|
927 |
|
928 |
return html`
|
929 |
-
<div
|
930 |
<header>
|
931 |
<h1>llama.cpp</h1>
|
932 |
</header>
|
@@ -936,7 +861,7 @@
|
|
936 |
</main>
|
937 |
|
938 |
<section id="write">
|
939 |
-
<${
|
940 |
</section>
|
941 |
|
942 |
<footer>
|
|
|
136 |
display: block;
|
137 |
}
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
header, footer {
|
140 |
text-align: center;
|
141 |
}
|
|
|
145 |
color: #888;
|
146 |
}
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
@keyframes loading-bg-wipe {
|
150 |
0% {
|
|
|
187 |
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
188 |
historyTemplate: "{{name}}: {{message}}",
|
189 |
transcript: [],
|
190 |
+
type: "chat",
|
191 |
char: "Llama",
|
192 |
user: "User",
|
193 |
})
|
|
|
365 |
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
366 |
}
|
367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
// send message to server
|
369 |
const chat = async (msg) => {
|
370 |
if (controller.value) {
|
371 |
console.log('already running...');
|
372 |
return;
|
373 |
}
|
374 |
+
controller.value = new AbortController();
|
375 |
|
376 |
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
377 |
|
|
|
391 |
).join("\n"),
|
392 |
});
|
393 |
|
394 |
+
const currentMessages = [];
|
395 |
+
const history = session.value.transcript
|
396 |
+
|
397 |
+
const llamaParams = {
|
398 |
...params.value,
|
399 |
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
|
402 |
+
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
403 |
+
const data = chunk.data;
|
404 |
+
|
405 |
+
if (data.stop) {
|
406 |
+
while (
|
407 |
+
currentMessages.length > 0 &&
|
408 |
+
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
409 |
+
) {
|
410 |
+
currentMessages.pop();
|
411 |
+
}
|
412 |
+
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
413 |
+
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
414 |
+
} else {
|
415 |
+
currentMessages.push(data);
|
416 |
+
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
417 |
+
}
|
418 |
+
|
419 |
+
if (data.timings) {
|
420 |
+
llamaStats.value = data.timings;
|
421 |
+
}
|
422 |
}
|
|
|
423 |
|
424 |
+
controller.value = null;
|
|
|
|
|
425 |
}
|
426 |
|
427 |
function MessageInput() {
|
428 |
const message = useSignal("")
|
429 |
|
430 |
+
const stop = (e) => {
|
431 |
+
e.preventDefault();
|
432 |
+
if (controller.value) {
|
433 |
+
controller.value.abort();
|
434 |
+
controller.value = null;
|
435 |
+
}
|
436 |
+
}
|
437 |
+
|
438 |
+
const reset = (e) => {
|
439 |
+
stop(e);
|
440 |
+
transcriptUpdate([]);
|
441 |
+
}
|
442 |
+
|
443 |
const submit = (e) => {
|
444 |
stop(e);
|
445 |
chat(message.value);
|
|
|
474 |
`
|
475 |
}
|
476 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
477 |
const ChatLog = (props) => {
|
478 |
const messages = session.value.transcript;
|
479 |
const container = useRef(null)
|
|
|
497 |
data;
|
498 |
message = html`<${Markdownish} text=${template(text)} />`
|
499 |
}
|
500 |
+
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
|
|
|
|
|
|
|
|
501 |
};
|
502 |
|
503 |
return html`
|
|
|
574 |
userTemplateAutosave()
|
575 |
}, [session.value, params.value])
|
576 |
|
577 |
+
return html`
|
578 |
+
<form>
|
579 |
+
<fieldset>
|
580 |
+
<${UserTemplateResetButton}/>
|
581 |
+
</fieldset>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
+
<fieldset>
|
584 |
+
<div>
|
585 |
+
<label for="prompt">Prompt</label>
|
586 |
+
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
|
587 |
+
</div>
|
588 |
+
</fieldset>
|
589 |
|
590 |
<fieldset class="two">
|
591 |
<div>
|
|
|
609 |
<label for="template">Chat history template</label>
|
610 |
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
611 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
612 |
|
|
|
|
|
|
|
|
|
613 |
<div>
|
614 |
+
<label for="template">Grammar</label>
|
615 |
+
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
616 |
+
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
617 |
+
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
618 |
</div>
|
619 |
</fieldset>
|
620 |
|
|
|
|
|
621 |
<fieldset class="two">
|
622 |
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
623 |
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
|
|
851 |
function App(props) {
|
852 |
|
853 |
return html`
|
854 |
+
<div>
|
855 |
<header>
|
856 |
<h1>llama.cpp</h1>
|
857 |
</header>
|
|
|
861 |
</main>
|
862 |
|
863 |
<section id="write">
|
864 |
+
<${MessageInput} />
|
865 |
</section>
|
866 |
|
867 |
<footer>
|
examples/server/server.cpp
CHANGED
@@ -200,7 +200,6 @@ struct llama_server_context
|
|
200 |
llama_model *model = nullptr;
|
201 |
llama_context *ctx = nullptr;
|
202 |
gpt_params params;
|
203 |
-
llama_sampling_context ctx_sampling;
|
204 |
int n_ctx;
|
205 |
|
206 |
grammar_parser::parse_state parsed_grammar;
|
@@ -255,7 +254,6 @@ struct llama_server_context
|
|
255 |
if (grammar != nullptr) {
|
256 |
llama_grammar_free(grammar);
|
257 |
grammar = nullptr;
|
258 |
-
ctx_sampling = llama_sampling_context_init(params, NULL);
|
259 |
}
|
260 |
}
|
261 |
|
@@ -331,8 +329,8 @@ struct llama_server_context
|
|
331 |
grammar_parser::print_grammar(stderr, parsed_grammar);
|
332 |
|
333 |
{
|
334 |
-
auto it = params.
|
335 |
-
if (it != params.
|
336 |
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
337 |
}
|
338 |
}
|
@@ -341,26 +339,14 @@ struct llama_server_context
|
|
341 |
grammar = llama_grammar_init(
|
342 |
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
343 |
}
|
344 |
-
ctx_sampling = llama_sampling_context_init(params, grammar);
|
345 |
return true;
|
346 |
}
|
347 |
|
348 |
void loadInfill()
|
349 |
{
|
350 |
-
|
351 |
-
|
352 |
-
params.input_suffix.erase(0, 1);
|
353 |
-
suff_rm_leading_spc = false;
|
354 |
-
}
|
355 |
-
|
356 |
-
auto prefix_tokens = tokenize(params.input_prefix, false);
|
357 |
-
auto suffix_tokens = tokenize(params.input_suffix, false);
|
358 |
-
const int space_token = 29871;
|
359 |
-
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
360 |
-
suffix_tokens.erase(suffix_tokens.begin());
|
361 |
-
}
|
362 |
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
363 |
-
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
364 |
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
365 |
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
366 |
prefix_tokens.push_back(llama_token_middle(ctx));
|
@@ -405,7 +391,6 @@ struct llama_server_context
|
|
405 |
// compare the evaluated prompt with the new prompt
|
406 |
n_past = common_part(embd, prompt_tokens);
|
407 |
embd = prompt_tokens;
|
408 |
-
|
409 |
if (n_past == num_prompt_tokens)
|
410 |
{
|
411 |
// we have to evaluate at least 1 token to generate logits.
|
@@ -413,9 +398,6 @@ struct llama_server_context
|
|
413 |
n_past--;
|
414 |
}
|
415 |
|
416 |
-
// since #3228 we now have to manually manage the KV cache
|
417 |
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
418 |
-
|
419 |
LOG_VERBOSE("prompt ingested", {
|
420 |
{"n_past", n_past},
|
421 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
@@ -465,6 +447,9 @@ struct llama_server_context
|
|
465 |
// compare the evaluated prompt with the new prompt
|
466 |
n_past = common_part(embd, prompt_tokens);
|
467 |
|
|
|
|
|
|
|
468 |
embd = prompt_tokens;
|
469 |
if (n_past == num_prompt_tokens)
|
470 |
{
|
@@ -472,9 +457,6 @@ struct llama_server_context
|
|
472 |
n_past--;
|
473 |
}
|
474 |
|
475 |
-
// since #3228 we now have to manually manage the KV cache
|
476 |
-
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
477 |
-
|
478 |
LOG_VERBOSE("prompt ingested", {
|
479 |
{"n_past", n_past},
|
480 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
@@ -557,12 +539,12 @@ struct llama_server_context
|
|
557 |
std::vector<llama_token_data> candidates;
|
558 |
candidates.reserve(llama_n_vocab(model));
|
559 |
|
560 |
-
result.tok =
|
561 |
|
562 |
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
563 |
|
564 |
-
const int32_t n_probs = params.
|
565 |
-
if (params.
|
566 |
{
|
567 |
// For llama_sample_token_greedy we need to sort candidates
|
568 |
llama_sample_softmax(ctx, &candidates_p);
|
@@ -637,7 +619,7 @@ struct llama_server_context
|
|
637 |
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
638 |
generated_text += token_text;
|
639 |
|
640 |
-
if (params.
|
641 |
{
|
642 |
generated_token_probs.push_back(token_with_probs);
|
643 |
}
|
@@ -718,16 +700,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|
718 |
printf("usage: %s [options]\n", argv0);
|
719 |
printf("\n");
|
720 |
printf("options:\n");
|
721 |
-
printf(" -h, --help
|
722 |
-
printf(" -v, --verbose
|
723 |
-
printf(" -t N,
|
724 |
-
printf(" -
|
725 |
-
printf("
|
726 |
-
printf(" --rope-freq-
|
727 |
-
printf(" --
|
728 |
-
printf("
|
729 |
-
printf("
|
730 |
-
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
731 |
if (llama_mlock_supported())
|
732 |
{
|
733 |
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
@@ -872,15 +853,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
872 |
}
|
873 |
params.n_threads = std::stoi(argv[i]);
|
874 |
}
|
875 |
-
else if (arg == "--threads-batch" || arg == "-tb")
|
876 |
-
{
|
877 |
-
if (++i >= argc)
|
878 |
-
{
|
879 |
-
invalid_param = true;
|
880 |
-
break;
|
881 |
-
}
|
882 |
-
params.n_threads_batch = std::stoi(argv[i]);
|
883 |
-
}
|
884 |
else if (arg == "-b" || arg == "--batch-size")
|
885 |
{
|
886 |
if (++i >= argc)
|
@@ -1035,35 +1007,34 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
1035 |
|
1036 |
static json format_generation_settings(llama_server_context &llama)
|
1037 |
{
|
1038 |
-
const auto
|
1039 |
-
const
|
1040 |
-
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
|
1041 |
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
1042 |
|
1043 |
return json{
|
1044 |
{"n_ctx", llama.n_ctx},
|
1045 |
{"model", llama.params.model_alias},
|
1046 |
{"seed", llama.params.seed},
|
1047 |
-
{"temp",
|
1048 |
-
{"top_k",
|
1049 |
-
{"top_p",
|
1050 |
-
{"tfs_z",
|
1051 |
-
{"typical_p",
|
1052 |
-
{"repeat_last_n",
|
1053 |
-
{"repeat_penalty",
|
1054 |
-
{"presence_penalty",
|
1055 |
-
{"frequency_penalty",
|
1056 |
-
{"mirostat",
|
1057 |
-
{"mirostat_tau",
|
1058 |
-
{"mirostat_eta",
|
1059 |
-
{"penalize_nl",
|
1060 |
{"stop", llama.params.antiprompt},
|
1061 |
{"n_predict", llama.params.n_predict},
|
1062 |
{"n_keep", llama.params.n_keep},
|
1063 |
{"ignore_eos", ignore_eos},
|
1064 |
{"stream", llama.stream},
|
1065 |
-
{"logit_bias",
|
1066 |
-
{"n_probs",
|
1067 |
{"grammar", llama.params.grammar},
|
1068 |
};
|
1069 |
}
|
@@ -1112,7 +1083,7 @@ static json format_final_response(llama_server_context &llama, const std::string
|
|
1112 |
{"timings", format_timings(llama)},
|
1113 |
};
|
1114 |
|
1115 |
-
if (llama.params.
|
1116 |
{
|
1117 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1118 |
}
|
@@ -1128,7 +1099,7 @@ static json format_partial_response(
|
|
1128 |
{"stop", false},
|
1129 |
};
|
1130 |
|
1131 |
-
if (llama.params.
|
1132 |
{
|
1133 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1134 |
}
|
@@ -1160,28 +1131,26 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
|
1160 |
static void parse_options_completion(const json &body, llama_server_context &llama)
|
1161 |
{
|
1162 |
gpt_params default_params;
|
1163 |
-
const auto & default_sparams = default_params.sampling_params;
|
1164 |
-
auto & sparams = llama.params.sampling_params;
|
1165 |
|
1166 |
llama.stream = json_value(body, "stream", false);
|
1167 |
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
1168 |
-
|
1169 |
-
|
1170 |
-
|
1171 |
-
|
1172 |
-
|
1173 |
-
|
1174 |
-
|
1175 |
-
|
1176 |
-
|
1177 |
-
|
1178 |
-
|
1179 |
-
|
1180 |
-
|
1181 |
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
1182 |
llama.params.seed = json_value(body, "seed", default_params.seed);
|
1183 |
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
1184 |
-
|
1185 |
|
1186 |
if (body.count("prompt") != 0)
|
1187 |
{
|
@@ -1192,10 +1161,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1192 |
llama.prompt = "";
|
1193 |
}
|
1194 |
|
1195 |
-
|
1196 |
if (json_value(body, "ignore_eos", false))
|
1197 |
{
|
1198 |
-
|
1199 |
}
|
1200 |
|
1201 |
const auto &logit_bias = body.find("logit_bias");
|
@@ -1211,11 +1180,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1211 |
{
|
1212 |
if (el[1].is_number())
|
1213 |
{
|
1214 |
-
|
1215 |
}
|
1216 |
else if (el[1].is_boolean() && !el[1].get<bool>())
|
1217 |
{
|
1218 |
-
|
1219 |
}
|
1220 |
}
|
1221 |
}
|
@@ -1235,8 +1204,6 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1235 |
}
|
1236 |
}
|
1237 |
|
1238 |
-
llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
|
1239 |
-
|
1240 |
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
1241 |
}
|
1242 |
|
@@ -1445,7 +1412,7 @@ int main(int argc, char **argv)
|
|
1445 |
}
|
1446 |
|
1447 |
auto probs = llama.generated_token_probs;
|
1448 |
-
if (llama.params.
|
1449 |
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
1450 |
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
1451 |
}
|
@@ -1497,7 +1464,7 @@ int main(int argc, char **argv)
|
|
1497 |
|
1498 |
std::vector<completion_token_output> probs_output = {};
|
1499 |
|
1500 |
-
if (llama.params.
|
1501 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1502 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1503 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
@@ -1618,7 +1585,7 @@ int main(int argc, char **argv)
|
|
1618 |
|
1619 |
std::vector<completion_token_output> probs_output = {};
|
1620 |
|
1621 |
-
if (llama.params.
|
1622 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1623 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1624 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
|
|
200 |
llama_model *model = nullptr;
|
201 |
llama_context *ctx = nullptr;
|
202 |
gpt_params params;
|
|
|
203 |
int n_ctx;
|
204 |
|
205 |
grammar_parser::parse_state parsed_grammar;
|
|
|
254 |
if (grammar != nullptr) {
|
255 |
llama_grammar_free(grammar);
|
256 |
grammar = nullptr;
|
|
|
257 |
}
|
258 |
}
|
259 |
|
|
|
329 |
grammar_parser::print_grammar(stderr, parsed_grammar);
|
330 |
|
331 |
{
|
332 |
+
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
333 |
+
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
334 |
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
335 |
}
|
336 |
}
|
|
|
339 |
grammar = llama_grammar_init(
|
340 |
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
341 |
}
|
|
|
342 |
return true;
|
343 |
}
|
344 |
|
345 |
void loadInfill()
|
346 |
{
|
347 |
+
auto prefix_tokens = tokenize(params.input_prefix, true); // always add BOS
|
348 |
+
auto suffix_tokens = tokenize(params.input_suffix, true); // always add BOS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
|
|
350 |
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
351 |
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
352 |
prefix_tokens.push_back(llama_token_middle(ctx));
|
|
|
391 |
// compare the evaluated prompt with the new prompt
|
392 |
n_past = common_part(embd, prompt_tokens);
|
393 |
embd = prompt_tokens;
|
|
|
394 |
if (n_past == num_prompt_tokens)
|
395 |
{
|
396 |
// we have to evaluate at least 1 token to generate logits.
|
|
|
398 |
n_past--;
|
399 |
}
|
400 |
|
|
|
|
|
|
|
401 |
LOG_VERBOSE("prompt ingested", {
|
402 |
{"n_past", n_past},
|
403 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
|
447 |
// compare the evaluated prompt with the new prompt
|
448 |
n_past = common_part(embd, prompt_tokens);
|
449 |
|
450 |
+
// since #3228 we now have to manually manage the KV cache
|
451 |
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
452 |
+
|
453 |
embd = prompt_tokens;
|
454 |
if (n_past == num_prompt_tokens)
|
455 |
{
|
|
|
457 |
n_past--;
|
458 |
}
|
459 |
|
|
|
|
|
|
|
460 |
LOG_VERBOSE("prompt ingested", {
|
461 |
{"n_past", n_past},
|
462 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
|
539 |
std::vector<llama_token_data> candidates;
|
540 |
candidates.reserve(llama_n_vocab(model));
|
541 |
|
542 |
+
result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
|
543 |
|
544 |
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
545 |
|
546 |
+
const int32_t n_probs = params.n_probs;
|
547 |
+
if (params.temp <= 0 && n_probs > 0)
|
548 |
{
|
549 |
// For llama_sample_token_greedy we need to sort candidates
|
550 |
llama_sample_softmax(ctx, &candidates_p);
|
|
|
619 |
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
620 |
generated_text += token_text;
|
621 |
|
622 |
+
if (params.n_probs > 0)
|
623 |
{
|
624 |
generated_token_probs.push_back(token_with_probs);
|
625 |
}
|
|
|
700 |
printf("usage: %s [options]\n", argv0);
|
701 |
printf("\n");
|
702 |
printf("options:\n");
|
703 |
+
printf(" -h, --help show this help message and exit\n");
|
704 |
+
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
705 |
+
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
706 |
+
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
707 |
+
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
708 |
+
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
709 |
+
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
710 |
+
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
711 |
+
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
|
|
712 |
if (llama_mlock_supported())
|
713 |
{
|
714 |
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
|
|
853 |
}
|
854 |
params.n_threads = std::stoi(argv[i]);
|
855 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
856 |
else if (arg == "-b" || arg == "--batch-size")
|
857 |
{
|
858 |
if (++i >= argc)
|
|
|
1007 |
|
1008 |
static json format_generation_settings(llama_server_context &llama)
|
1009 |
{
|
1010 |
+
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
|
1011 |
+
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
|
|
|
1012 |
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
1013 |
|
1014 |
return json{
|
1015 |
{"n_ctx", llama.n_ctx},
|
1016 |
{"model", llama.params.model_alias},
|
1017 |
{"seed", llama.params.seed},
|
1018 |
+
{"temp", llama.params.temp},
|
1019 |
+
{"top_k", llama.params.top_k},
|
1020 |
+
{"top_p", llama.params.top_p},
|
1021 |
+
{"tfs_z", llama.params.tfs_z},
|
1022 |
+
{"typical_p", llama.params.typical_p},
|
1023 |
+
{"repeat_last_n", llama.params.repeat_last_n},
|
1024 |
+
{"repeat_penalty", llama.params.repeat_penalty},
|
1025 |
+
{"presence_penalty", llama.params.presence_penalty},
|
1026 |
+
{"frequency_penalty", llama.params.frequency_penalty},
|
1027 |
+
{"mirostat", llama.params.mirostat},
|
1028 |
+
{"mirostat_tau", llama.params.mirostat_tau},
|
1029 |
+
{"mirostat_eta", llama.params.mirostat_eta},
|
1030 |
+
{"penalize_nl", llama.params.penalize_nl},
|
1031 |
{"stop", llama.params.antiprompt},
|
1032 |
{"n_predict", llama.params.n_predict},
|
1033 |
{"n_keep", llama.params.n_keep},
|
1034 |
{"ignore_eos", ignore_eos},
|
1035 |
{"stream", llama.stream},
|
1036 |
+
{"logit_bias", llama.params.logit_bias},
|
1037 |
+
{"n_probs", llama.params.n_probs},
|
1038 |
{"grammar", llama.params.grammar},
|
1039 |
};
|
1040 |
}
|
|
|
1083 |
{"timings", format_timings(llama)},
|
1084 |
};
|
1085 |
|
1086 |
+
if (llama.params.n_probs > 0)
|
1087 |
{
|
1088 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1089 |
}
|
|
|
1099 |
{"stop", false},
|
1100 |
};
|
1101 |
|
1102 |
+
if (llama.params.n_probs > 0)
|
1103 |
{
|
1104 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1105 |
}
|
|
|
1131 |
static void parse_options_completion(const json &body, llama_server_context &llama)
|
1132 |
{
|
1133 |
gpt_params default_params;
|
|
|
|
|
1134 |
|
1135 |
llama.stream = json_value(body, "stream", false);
|
1136 |
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
1137 |
+
llama.params.top_k = json_value(body, "top_k", default_params.top_k);
|
1138 |
+
llama.params.top_p = json_value(body, "top_p", default_params.top_p);
|
1139 |
+
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
|
1140 |
+
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
|
1141 |
+
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
|
1142 |
+
llama.params.temp = json_value(body, "temperature", default_params.temp);
|
1143 |
+
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
|
1144 |
+
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
|
1145 |
+
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
|
1146 |
+
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
|
1147 |
+
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
|
1148 |
+
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
|
1149 |
+
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
|
1150 |
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
1151 |
llama.params.seed = json_value(body, "seed", default_params.seed);
|
1152 |
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
1153 |
+
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
|
1154 |
|
1155 |
if (body.count("prompt") != 0)
|
1156 |
{
|
|
|
1161 |
llama.prompt = "";
|
1162 |
}
|
1163 |
|
1164 |
+
llama.params.logit_bias.clear();
|
1165 |
if (json_value(body, "ignore_eos", false))
|
1166 |
{
|
1167 |
+
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
1168 |
}
|
1169 |
|
1170 |
const auto &logit_bias = body.find("logit_bias");
|
|
|
1180 |
{
|
1181 |
if (el[1].is_number())
|
1182 |
{
|
1183 |
+
llama.params.logit_bias[tok] = el[1].get<float>();
|
1184 |
}
|
1185 |
else if (el[1].is_boolean() && !el[1].get<bool>())
|
1186 |
{
|
1187 |
+
llama.params.logit_bias[tok] = -INFINITY;
|
1188 |
}
|
1189 |
}
|
1190 |
}
|
|
|
1204 |
}
|
1205 |
}
|
1206 |
|
|
|
|
|
1207 |
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
1208 |
}
|
1209 |
|
|
|
1412 |
}
|
1413 |
|
1414 |
auto probs = llama.generated_token_probs;
|
1415 |
+
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
1416 |
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
1417 |
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
1418 |
}
|
|
|
1464 |
|
1465 |
std::vector<completion_token_output> probs_output = {};
|
1466 |
|
1467 |
+
if (llama.params.n_probs > 0) {
|
1468 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1469 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1470 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
|
|
1585 |
|
1586 |
std::vector<completion_token_output> probs_output = {};
|
1587 |
|
1588 |
+
if (llama.params.n_probs > 0) {
|
1589 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1590 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1591 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
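The block of json_value(...) calls above (new lines 1135-1153) is the whole per-request sampling configuration: each field of llama.params falls back to the matching gpt_params default whenever the client omits the key. The helper itself is defined elsewhere in server.cpp; as a rough illustration only (not the server's actual implementation, and assuming nlohmann::json), the pattern it implements looks like this:

    // Sketch of the json_value() fallback pattern used above. Assumes nlohmann::json;
    // missing or null keys yield the supplied default instead of throwing.
    #include <nlohmann/json.hpp>
    #include <string>
    using json = nlohmann::json;

    template <typename T>
    static T json_value_sketch(const json & body, const std::string & key, const T & default_value) {
        // take body[key] when it is present and non-null, otherwise fall back
        return body.contains(key) && !body.at(key).is_null()
            ? body.at(key).get<T>()
            : default_value;
    }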
examples/speculative/speculative.cpp
CHANGED
@@ -125,8 +125,6 @@ int main(int argc, char ** argv) {
         grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     }
 
-    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
-
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
@@ -136,7 +134,7 @@ int main(int argc, char ** argv) {
 
         while (true) {
             // sample from the target model
-            llama_token id = …
+            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
 
             // remember which tokens were sampled - used for repetition penalties during sampling
             last_tokens.erase(last_tokens.begin());
@@ -213,13 +211,7 @@ int main(int argc, char ** argv) {
                 if (grammar_dft) {
                     llama_grammar_free(grammar_dft);
                 }
-
-                // … that will need to change.
-                auto it = ctx_sampling.sequence_contexts.find(0);
-                GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
-                // This is necessary because each sequence id in sequence_contexts
-                // uses a copy of the original grammar.
-                grammar_dft = llama_grammar_copy(it->second.grammar);
+                grammar_dft = llama_grammar_copy(grammar_tgt);
 
                 LOG("copied target grammar to draft grammar\n");
             }
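The change above drops the per-sequence sampling contexts and rebuilds the draft grammar directly from the target grammar. A minimal sketch of that resync pattern, using only the functions that appear in the diff (llama_grammar_free and llama_grammar_copy) and omitting error handling, would be:

    // Sketch only: whenever the draft model must restart from the target's grammar
    // state, drop the stale draft-side copy and take a fresh, independent one.
    #include "llama.h"

    static void resync_draft_grammar(llama_grammar * & grammar_dft, llama_grammar * grammar_tgt) {
        if (grammar_dft) {
            llama_grammar_free(grammar_dft);            // discard the previous draft copy
        }
        grammar_dft = llama_grammar_copy(grammar_tgt);  // independent copy for the draft model
    }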
ggml-alloc.c
CHANGED
@@ -1,5 +1,4 @@
 #include "ggml-alloc.h"
-#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@@ -7,6 +6,25 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/types.h>
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <memoryapi.h>
+#endif
+
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -62,9 +80,8 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
-    struct ggml_backend_buffer * buffer;
-    bool buffer_owned;
     void * data;
+    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -102,9 +119,16 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor);
+
+    UNUSED(alloc);
+}
+
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    …
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -112,10 +136,11 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-    …
-    size_t size = …
+#endif
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -163,8 +188,6 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
-    tensor->buffer = alloc->buffer;
-    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -185,21 +208,19 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
-        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    …
-    …
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    …
-    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -264,18 +285,15 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = …
+    alloc->free_blocks[0].size = alloc->size - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct …
-
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
     *alloc = (struct ggml_allocr){
-        /*. …
-        /*. …
-        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.data          = */ data,
+        /*.size          = */ size,
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -294,26 +312,74 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-…
-…
-…
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
 
-…
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
 }
 
-…
-…
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 128GB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
+
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
 
     *alloc = (struct ggml_allocr){
-        /*. …
-        /*. …
-        /*. …
-        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
+        /*.data          = */ base_addr,
+        /*.size          = */ size,
+        /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ …
+        /*.measure       = */ true,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -327,8 +393,8 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->…
-    …
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
     }
     free(alloc);
 }
@@ -371,6 +437,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
+        case GGML_OP_CONT:
             return true;
 
         default:
@@ -378,23 +445,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
-    assert(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
-    view->buffer  = view->view_src->buffer;
-    view->data    = (char *)view->view_src->data + view->view_offs;
-
-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
-    ggml_backend_buffer_init_tensor(alloc->buffer, view);
-}
-
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            …
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -422,17 +478,13 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                         // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                         // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                         AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                        node->…
-                        view_src_hn->n_views += 1;
-                        init_view(alloc, node);
+                        node->data = parent->data;
                         return;
                     }
                 }
                 else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                    node->…
-                    p_hn->n_views += 1;
-                    init_view(alloc, node);
+                    node->data = parent->data;
                     return;
                 }
             }
@@ -443,7 +495,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         }
     }
 
-size_t …
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -461,10 +513,6 @@ size_t ggml_allocr_alloc_graph_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
-                if (node->buffer == NULL && node->data != NULL) {
-                    // view of a pre-allocated tensor, didn't call init_view() yet
-                    init_view(alloc, node);
-                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -473,9 +521,6 @@ size_t ggml_allocr_alloc_graph_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
-                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                    init_view(alloc, parent);
-                }
             }
         }
     }
@@ -586,7 +631,7 @@ size_t ggml_allocr_alloc_graph_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return …
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
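With this change ggml_allocr_new_measure() no longer depends on a backend buffer: it reserves a large range of uncommitted virtual memory (VirtualAlloc with MEM_RESERVE on Windows, mmap with PROT_NONE elsewhere) purely to hand out addresses, so a measuring pass over the graph can report the peak size before any real buffer exists. A sketch of the resulting two-pass workflow, using only functions from the diff above and a hypothetical build_graph() helper (a real caller may also add alignment padding to the measured size), looks like this:

    // Two-pass allocation sketch against the ggml-alloc API shown above.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include <stdlib.h>

    struct ggml_cgraph * build_graph(struct ggml_context * ctx);  // hypothetical helper

    static void alloc_example(struct ggml_context * ctx) {
        const size_t alignment = 32;  // assumption: a typical tensor alignment

        // pass 1: measure - only addresses are handed out, nothing is committed
        struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
        size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph(ctx));
        (void) ggml_allocr_max_size(measure);  // the same peak size, queryable separately
        ggml_allocr_free(measure);

        // pass 2: allocate for real inside a buffer of the measured size
        void * buf = malloc(mem_size);
        struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, alignment);
        ggml_allocr_alloc_graph(alloc, build_graph(ctx));  // rebuild with fresh tensors
        // ... run the graph ...
        ggml_allocr_free(alloc);
        free(buf);
    }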
ggml-alloc.h
CHANGED
@@ -6,27 +6,21 @@
 extern "C" {
 #endif
 
-struct ggml_backend_buffer;
 
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
-GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
-GGML_API void   ggml_allocr_free …
-GGML_API bool   ggml_allocr_is_measure …
-GGML_API void   ggml_allocr_reset …
-GGML_API void   ggml_allocr_alloc …
+GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size …
+GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
 
-GGML_API size_t ggml_allocr_alloc_graph_n(
-    struct ggml_allocr * alloc,
-    struct ggml_cgraph ** graphs, int n_graphs,
-    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 
 #ifdef __cplusplus
 }
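The slimmed-down header keeps the allocator self-contained: multi-graph allocation is now internal and only the single-graph and single-tensor entry points remain public. For one tensor the flow is short; a sketch, assuming the ggml context was created with no_alloc = true so that tensor data is left for the allocator to assign, is:

    // Single-tensor sketch against the header above; buf/buf_size are caller-owned memory.
    #include "ggml.h"
    #include "ggml-alloc.h"

    static void alloc_one_tensor(struct ggml_context * ctx, void * buf, size_t buf_size) {
        struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, /*alignment =*/ 32);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        ggml_allocr_alloc(alloc, t);        // assigns t->data from the caller's buffer

        if (!ggml_allocr_is_measure(alloc)) {
            // t->data points into buf and can now be written to
        }

        ggml_allocr_reset(alloc);           // forget all allocations, keep the buffer
        ggml_allocr_free(alloc);
    }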
ggml-cuda.cu
CHANGED
@@ -62,7 +62,6 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -415,13 +414,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1577,34 +1574,6 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
-    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int r = y[row];
-
-    // copy x[r*ncols + col] to dst[row*ncols + col]
-    const int xi = r*ncols + col;
-    const int di = row*ncols + col;
-
-    const int ib = xi/qk; // block index
-    const int iqs = (xi%qk)/qr; // quant index
-    const int iybs = di - di%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(x, ib, iqs, v);
-
-    dst[iybs + iqs + 0]        = v.x;
-    dst[iybs + iqs + y_offset] = v.y;
-}
-
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4586,24 +4555,6 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, nrows, 1);
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
-}
-
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5485,11 +5436,6 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
     const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5753,7 +5699,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5789,107 +5735,6 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }
 
-static void ggml_cuda_op_repeat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
-    for (int i3 = 0; i3 < nr3; i3++) {
-        for (int k3 = 0; k3 < ne03; k3++) {
-            for (int i2 = 0; i2 < nr2; i2++) {
-                for (int k2 = 0; k2 < ne02; k2++) {
-                    for (int i1 = 0; i1 < nr1; i1++) {
-                        for (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                CUDA_CHECK(cudaMemcpyAsync(
-                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
-                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
-                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    (void) src1;
-    (void) src1_d;
-}
-
-static void ggml_cuda_op_get_rows(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    const int ncols = src0->ne[0];
-    const int nrows = ggml_nelements(src1);
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
-            break;
-        default:
-            // TODO: k-quants
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6430,12 +6275,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    …
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    …
     GGML_ASSERT(n_head == ne02);
 
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6494,14 +6339,7 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    float scale;
-    // HACK: support for ggml backend interface
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
-    }
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6511,24 +6349,6 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_clamp(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const float min = ((float *) dst->op_params)[0];
-    const float max = ((float *) dst->op_params)[1];
-
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);
 
@@ -6538,9 +6358,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6681,9 +6501,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * dst_extra  = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6761,7 +6581,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
-            if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6843,7 +6663,7 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && …
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6934,14 +6754,6 @@ static void ggml_cuda_op_mul_mat(
     }
 }
 
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
-}
-
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
-}
-
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@@ -6996,13 +6808,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -7027,13 +6839,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     const int64_t row_stride_x = nb01 / sizeof(half);
@@ -7054,11 +6866,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    if (all_on_device && …
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    }
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -7090,10 +6902,6 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
-}
-
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -7123,8 +6931,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -7179,8 +6987,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     const size_t nb1 = tensor->nb[1];
 
-
-    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
     for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7234,6 +7042,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
         }
 
        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
        extra->data_device[id] = buf;
@@ -7272,17 +7081,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;
 
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
     return extra;
@@ -7310,7 +7119,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         return;
     }
 
-    ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -7319,7 +7128,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
 
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7328,7 +7137,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
-        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;
@@ -7370,13 +7179,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }
 
-    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;
 
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
@@ -7394,7 +7203,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
@@ -7451,47 +7260,58 @@ void ggml_cuda_free_scratch() {
     g_scratch_buffer = nullptr;
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor)
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
-        return false;
-    }
-
     switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
-            break;
        case GGML_OP_DUP:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_ADD:
            func = ggml_cuda_add;
            break;
        case GGML_OP_MUL:
            func = ggml_cuda_mul;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    func = ggml_cuda_gelu;
                    break;
                case GGML_UNARY_OP_SILU:
                    func = ggml_cuda_silu;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_NORM:
            func = ggml_cuda_norm;
            break;
        case GGML_OP_RMS_NORM:
            func = ggml_cuda_rms_norm;
            break;
        case GGML_OP_MUL_MAT:
@@ -7501,36 +7321,54 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            func = ggml_cuda_mul_mat;
            break;
        case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
-            break;
-        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
            }
-            func = …
            break;
        case GGML_OP_CPY:
            func = ggml_cuda_cpy;
            break;
        case GGML_OP_CONT:
            func = ggml_cuda_dup;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            func = ggml_cuda_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
            func = ggml_cuda_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
            func = ggml_cuda_soft_max;
            break;
        case GGML_OP_ROPE:
            func = ggml_cuda_rope;
            break;
        case GGML_OP_ALIBI:
            func = ggml_cuda_alibi;
            break;
        default:
@@ -7558,263 +7396,3 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_backend_context_cuda {
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-    delete cuda_ctx;
-    delete backend;
-}
-
-struct ggml_backend_buffer_context_cuda {
-    void * device;
-
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-
-    ~ggml_backend_buffer_context_cuda() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->device));
-    delete ctx;
-}
-
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->device;
-}
-
-static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
-                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
-        }
-    }
-
-    return size;
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->backend == buffer->backend);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-
-    extra->data_device[g_main_device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
-        }
-    }
-
-    UNUSED(buffer);
-}
-
-static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
-    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
-    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .free_tensor    = */ NULL,
-};
-
-static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
-    ggml_cuda_set_device(g_main_device);
-
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
-    CUDA_CHECK(cudaMalloc(&ctx->device, size));
-    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
-    return 128;
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-    UNUSED(backend);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
-
-    return nullptr;
-
-    UNUSED(backend);
-    UNUSED(cgraph);
-}
-
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_cuda_set_device(g_main_device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        assert(node->backend == GGML_BACKEND_GPU);
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU);
-            }
-        }
-
-        bool ok = ggml_cuda_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i cuda_backend_i = {
-    /* .get_name            = */ ggml_backend_cuda_name,
-    /* .free                = */ ggml_backend_cuda_free,
-    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
-    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
-    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
-    /* .synchronize         = */ ggml_backend_cuda_synchronize,
-    /* .cpy_tensor_from     = */ nullptr,
-    /* .cpy_tensor_to       = */ nullptr,
-    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
-    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
-    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
-    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op         = */ nullptr,
-};
-
-ggml_backend_t ggml_backend_cuda_init() {
-    ggml_init_cublas(); // TODO: remove from ggml.c
-
-    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
-
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */ cuda_backend_i,
-        /* .context   = */ ctx
-    };
-
-    return cuda_backend;
-}
417 |
#define CUDA_ROPE_BLOCK_SIZE 256
|
418 |
#define CUDA_ALIBI_BLOCK_SIZE 32
|
419 |
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
420 |
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
421 |
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
|
|
422 |
|
423 |
// dmmv = dequantize_mul_mat_vec
|
424 |
#ifndef GGML_CUDA_DMMV_X
|
|
|
1574 |
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
1575 |
}
|
1576 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1577 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
1578 |
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
|
1579 |
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
|
|
4555 |
dst[i] = scale * x[i];
|
4556 |
}
|
4557 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4558 |
static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
4559 |
const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
|
4560 |
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
|
|
5436 |
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
5437 |
}
|
5438 |
|
|
|
|
|
|
|
|
|
|
|
5439 |
template<typename T>
|
5440 |
static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
|
5441 |
const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
|
|
5699 |
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
|
5700 |
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
|
5701 |
kind = cudaMemcpyDeviceToDevice;
|
5702 |
+
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
|
5703 |
int id;
|
5704 |
CUDA_CHECK(cudaGetDevice(&id));
|
5705 |
src_ptr = (char *) extra->data_device[id];
|
|
|
5735 |
}
|
5736 |
}
|
5737 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5738 |
inline void ggml_cuda_op_add(
|
5739 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5740 |
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
|
|
6275 |
const int64_t ne02 = src0->ne[2];
|
6276 |
const int64_t nrows = ggml_nrows(src0);
|
6277 |
|
6278 |
+
const int n_past = ((int32_t *) dst->op_params)[0];
|
6279 |
const int n_head = ((int32_t *) dst->op_params)[1];
|
6280 |
float max_bias;
|
6281 |
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
6282 |
|
6283 |
+
GGML_ASSERT(ne01 + n_past == ne00);
|
6284 |
GGML_ASSERT(n_head == ne02);
|
6285 |
|
6286 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
|
|
6339 |
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6340 |
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
6341 |
|
6342 |
+
const float scale = ((float *) src1->data)[0];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6343 |
|
6344 |
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
6345 |
CUDA_CHECK(cudaGetLastError());
|
|
|
6349 |
(void) src1_dd;
|
6350 |
}
|
6351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6352 |
static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
|
6353 |
const int64_t nrows0 = ggml_nrows(src0);
|
6354 |
|
|
|
6358 |
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
6359 |
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
6360 |
|
6361 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6362 |
+
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
6363 |
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6364 |
|
6365 |
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6366 |
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
|
|
6501 |
const size_t q8_1_ts = sizeof(block_q8_1);
|
6502 |
const size_t q8_1_bs = QK8_1;
|
6503 |
|
6504 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6505 |
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6506 |
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6507 |
|
6508 |
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6509 |
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
|
|
6581 |
if (convert_src1_to_q8_1) {
|
6582 |
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6583 |
|
6584 |
+
if (split && src1_on_device && src1_is_contiguous) {
|
6585 |
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6586 |
CUDA_CHECK(cudaGetLastError());
|
6587 |
}
|
|
|
6663 |
GGML_ASSERT(false);
|
6664 |
}
|
6665 |
|
6666 |
+
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6667 |
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6668 |
CUDA_CHECK(cudaGetLastError());
|
6669 |
}
|
|
|
6754 |
}
|
6755 |
}
|
6756 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6757 |
static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6758 |
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
6759 |
}
|
|
|
6808 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6809 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6810 |
|
6811 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6812 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6813 |
|
6814 |
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6815 |
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6816 |
|
6817 |
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6818 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6819 |
|
6820 |
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
|
|
6839 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6840 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6841 |
|
6842 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6843 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
6844 |
|
6845 |
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6846 |
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6847 |
|
6848 |
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6849 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6850 |
|
6851 |
const int64_t row_stride_x = nb01 / sizeof(half);
|
|
|
6866 |
}
|
6867 |
}
|
6868 |
|
6869 |
+
if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
6870 |
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
6871 |
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
6872 |
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
6873 |
+
}else if (src0->type == GGML_TYPE_F32) {
|
6874 |
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
6875 |
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
6876 |
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
|
|
6902 |
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
6903 |
}
|
6904 |
|
|
|
|
|
|
|
|
|
6905 |
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6906 |
const int64_t ne = ggml_nelements(src0);
|
6907 |
GGML_ASSERT(ne == ggml_nelements(src1));
|
|
|
6931 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6932 |
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6933 |
|
6934 |
+
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6935 |
+
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6936 |
|
6937 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
6938 |
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
|
|
|
6987 |
|
6988 |
const size_t nb1 = tensor->nb[1];
|
6989 |
|
6990 |
+
ggml_backend backend = tensor->backend;
|
6991 |
+
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
6992 |
memset(extra, 0, sizeof(*extra));
|
6993 |
|
6994 |
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
|
7042 |
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
7043 |
}
|
7044 |
|
7045 |
+
|
7046 |
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
7047 |
|
7048 |
extra->data_device[id] = buf;
|
|
|
7081 |
delete extra;
|
7082 |
}
|
7083 |
|
7084 |
+
static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
7085 |
static size_t g_temp_tensor_extra_index = 0;
|
7086 |
|
7087 |
+
static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
7088 |
if (g_temp_tensor_extras == nullptr) {
|
7089 |
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
7090 |
}
|
7091 |
|
7092 |
size_t alloc_index = g_temp_tensor_extra_index;
|
7093 |
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
7094 |
+
struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
7095 |
memset(extra, 0, sizeof(*extra));
|
7096 |
|
7097 |
return extra;
|
|
|
7119 |
return;
|
7120 |
}
|
7121 |
|
7122 |
+
struct ggml_tensor_extra_gpu * extra;
|
7123 |
|
7124 |
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7125 |
tensor->op == GGML_OP_VIEW ||
|
|
|
7128 |
|
7129 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7130 |
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7131 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7132 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7133 |
size_t offset = 0;
|
7134 |
if (tensor->op == GGML_OP_VIEW) {
|
|
|
7137 |
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7138 |
extra->data_device[g_main_device] = src0_ddc + offset;
|
7139 |
} else if (tensor->op == GGML_OP_CPY) {
|
7140 |
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
7141 |
void * src1_ddv = src1_extra->data_device[g_main_device];
|
7142 |
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7143 |
extra->data_device[g_main_device] = src1_ddv;
|
|
|
7179 |
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
7180 |
}
|
7181 |
|
7182 |
+
struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
7183 |
|
7184 |
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7185 |
tensor->op == GGML_OP_VIEW;
|
7186 |
|
7187 |
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7188 |
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
7189 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7190 |
size_t view_offset = 0;
|
7191 |
if (tensor->op == GGML_OP_VIEW) {
|
|
|
7203 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
7204 |
GGML_ASSERT(ggml_is_contiguous(tensor));
|
7205 |
|
7206 |
+
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
7207 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7208 |
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
7209 |
}
|
|
|
7260 |
g_scratch_buffer = nullptr;
|
7261 |
}
|
7262 |
|
7263 |
+
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
7264 |
ggml_cuda_func_t func;
|
7265 |
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
7266 |
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
7267 |
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
7268 |
|
|
|
|
|
|
|
|
|
7269 |
switch (tensor->op) {
|
|
|
|
|
|
|
|
|
|
|
|
|
7270 |
case GGML_OP_DUP:
|
7271 |
+
if (!any_on_device) {
|
7272 |
+
return false;
|
7273 |
+
}
|
7274 |
func = ggml_cuda_dup;
|
7275 |
break;
|
7276 |
case GGML_OP_ADD:
|
7277 |
+
if (!any_on_device) {
|
7278 |
+
return false;
|
7279 |
+
}
|
7280 |
func = ggml_cuda_add;
|
7281 |
break;
|
7282 |
case GGML_OP_MUL:
|
7283 |
+
if (!any_on_device) {
|
7284 |
+
return false;
|
7285 |
+
}
|
7286 |
func = ggml_cuda_mul;
|
7287 |
break;
|
7288 |
case GGML_OP_UNARY:
|
7289 |
switch (ggml_get_unary_op(tensor)) {
|
7290 |
case GGML_UNARY_OP_GELU:
|
7291 |
+
if (!any_on_device) {
|
7292 |
+
return false;
|
7293 |
+
}
|
7294 |
func = ggml_cuda_gelu;
|
7295 |
break;
|
7296 |
case GGML_UNARY_OP_SILU:
|
7297 |
+
if (!any_on_device) {
|
7298 |
+
return false;
|
7299 |
+
}
|
7300 |
func = ggml_cuda_silu;
|
7301 |
break;
|
7302 |
default:
|
7303 |
return false;
|
7304 |
} break;
|
7305 |
case GGML_OP_NORM:
|
7306 |
+
if (!any_on_device) {
|
7307 |
+
return false;
|
7308 |
+
}
|
7309 |
func = ggml_cuda_norm;
|
7310 |
break;
|
7311 |
case GGML_OP_RMS_NORM:
|
7312 |
+
if (!any_on_device) {
|
7313 |
+
return false;
|
7314 |
+
}
|
7315 |
func = ggml_cuda_rms_norm;
|
7316 |
break;
|
7317 |
case GGML_OP_MUL_MAT:
|
|
|
7321 |
func = ggml_cuda_mul_mat;
|
7322 |
break;
|
7323 |
case GGML_OP_SCALE:
|
|
|
|
|
|
|
7324 |
if (!any_on_device) {
|
7325 |
return false;
|
7326 |
}
|
7327 |
+
func = ggml_cuda_scale;
|
7328 |
break;
|
7329 |
case GGML_OP_CPY:
|
7330 |
+
if (!any_on_device) {
|
7331 |
+
return false;
|
7332 |
+
}
|
7333 |
func = ggml_cuda_cpy;
|
7334 |
break;
|
7335 |
case GGML_OP_CONT:
|
7336 |
+
if (!any_on_device) {
|
7337 |
+
return false;
|
7338 |
+
}
|
7339 |
func = ggml_cuda_dup;
|
7340 |
break;
|
7341 |
case GGML_OP_RESHAPE:
|
7342 |
case GGML_OP_VIEW:
|
7343 |
case GGML_OP_PERMUTE:
|
7344 |
case GGML_OP_TRANSPOSE:
|
7345 |
+
if (!any_on_device) {
|
7346 |
+
return false;
|
7347 |
+
}
|
7348 |
func = ggml_cuda_nop;
|
7349 |
break;
|
7350 |
case GGML_OP_DIAG_MASK_INF:
|
7351 |
+
if (!any_on_device) {
|
7352 |
+
return false;
|
7353 |
+
}
|
7354 |
func = ggml_cuda_diag_mask_inf;
|
7355 |
break;
|
7356 |
case GGML_OP_SOFT_MAX:
|
7357 |
+
if (!any_on_device) {
|
7358 |
+
return false;
|
7359 |
+
}
|
7360 |
func = ggml_cuda_soft_max;
|
7361 |
break;
|
7362 |
case GGML_OP_ROPE:
|
7363 |
+
if (!any_on_device) {
|
7364 |
+
return false;
|
7365 |
+
}
|
7366 |
func = ggml_cuda_rope;
|
7367 |
break;
|
7368 |
case GGML_OP_ALIBI:
|
7369 |
+
if (!any_on_device) {
|
7370 |
+
return false;
|
7371 |
+
}
|
7372 |
func = ggml_cuda_alibi;
|
7373 |
break;
|
7374 |
default:
|
|
|
7396 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
7397 |
snprintf(description, description_size, "%s", prop.name);
|
7398 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
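The launcher wrappers in the excerpts above (add_f32_cuda, scale_f32_cuda and similar) derive their grid size with a ceil-division over a fixed block size. A minimal C sketch of that arithmetic, independent of CUDA; num_blocks_for and the values used in main are illustrative stand-ins, not part of ggml:

    #include <assert.h>
    #include <stdio.h>

    /* Smallest number of fixed-size blocks that covers kx elements,
     * mirroring (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE. */
    static int num_blocks_for(int kx, int block_size) {
        return (kx + block_size - 1) / block_size;
    }

    int main(void) {
        assert(num_blocks_for(1000, 256) == 4); /* 3*256 = 768 < 1000, so a 4th block is needed */
        assert(num_blocks_for(1024, 256) == 4); /* exact multiple: no extra block */
        printf("%d\n", num_blocks_for(1000, 256));
        return 0;
    }

The last block may extend past kx; in the CUDA kernels this is typically handled by an early bounds check on the computed element index.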
ggml-cuda.h
CHANGED
@@ -1,7 +1,6 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"

 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
@@ -43,9 +42,6 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

-// backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
-
 #ifdef __cplusplus
 }
 #endif
ggml-metal.h
CHANGED
@@ -20,7 +20,6 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"

 #include <stddef.h>
 #include <stdbool.h>
@@ -36,15 +35,10 @@ struct ggml_cgraph;
 extern "C" {
 #endif

-
-// internal API
-// temporary exposed to user-code
-//
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);

 struct ggml_metal_context;

-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -89,17 +83,6 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

-//
-// backend API
-// user-code should use only these functions
-//
-
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
-
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
 #ifdef __cplusplus
 }
 #endif
ggml-metal.m
CHANGED
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
         } break;
     case GGML_OP_CONCAT:
         {
-            const int64_t nb = ne00;
+            int64_t nb = ne00;
             [encoder setComputePipelineState:ctx->pipeline_concat];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@@ -812,7 +812,6 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&nb length:sizeof(nb) atIndex:27];

             const int nth = MIN(1024, ne0);
-
             [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         } break;
     case GGML_OP_ADD:
@@ -910,10 +909,9 @@ void ggml_metal_graph_compute(
             [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
             [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

-            const int64_t n = ggml_nelements(dst);
-            GGML_ASSERT(n % 4 == 0);
+            const int64_t n = ggml_nelements(dst)/4;

-            [encoder dispatchThreadgroups:MTLSizeMake(n
+            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         } break;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(gf->nodes[i])) {
@@ -923,10 +921,9 @@ void ggml_metal_graph_compute(
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-            const int64_t n = ggml_nelements(dst);
-            GGML_ASSERT(n % 4 == 0);
+            const int64_t n = ggml_nelements(dst)/4;

-            [encoder dispatchThreadgroups:MTLSizeMake(n
+            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         } break;
     case GGML_UNARY_OP_RELU:
         {
@@ -944,10 +941,9 @@ void ggml_metal_graph_compute(
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

-            const int64_t n = ggml_nelements(dst);
-            GGML_ASSERT(n % 4 == 0);
+            const int64_t n = ggml_nelements(dst)/4;

-            [encoder dispatchThreadgroups:MTLSizeMake(n
+            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
         } break;
     default:
         {
@@ -1044,7 +1040,7 @@ void ggml_metal_graph_compute(
             !ggml_is_transposed(src0) &&
             !ggml_is_transposed(src1) &&
             src1t == GGML_TYPE_F32 &&
-            ne00 % 32 == 0 &&
+            ne00 % 32 == 0 &&
             ne11 > ne11_mm_min) {
             //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
             switch (src0->type) {
@@ -1255,8 +1251,6 @@ void ggml_metal_graph_compute(
         } break;
     case GGML_OP_RMS_NORM:
         {
-            GGML_ASSERT(ne00 % 4 == 0);
-
             float eps;
             memcpy(&eps, dst->op_params, sizeof(float));
@@ -1299,7 +1293,7 @@ void ggml_metal_graph_compute(

             const int nth = MIN(1024, ne00);

-
+            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
             const int n_head = ((int32_t *) dst->op_params)[1];
             float max_bias;
             memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
@@ -1477,140 +1471,3 @@ preferably one under the recommended max working set size, or else fall back to

     }
 }
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-static const char * ggml_backend_metal_name(ggml_backend_t backend) {
-    return "Metal";
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-    ggml_metal_free(ctx);
-    free(backend);
-}
-
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)buffer->context;
-}
-
-static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-    UNUSED(buffer);
-}
-
-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
-    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
-    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
-    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .init_tensor    = */ NULL, // no initialization required
-    /* .free_tensor    = */ NULL, // no cleanup required
-};
-
-static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    void * data = ggml_metal_host_malloc(size);
-
-    // TODO: set proper name of the buffers
-    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
-
-    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
-}
-
-static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
-    return 32;
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
-    memcpy((char *)tensor->data + offset, data, size);
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
-
-    ggml_metal_graph_compute(metal_ctx, cgraph);
-}
-
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return true;
-    UNUSED(backend);
-    UNUSED(op);
-}
-
-static struct ggml_backend_i metal_backend_i = {
-    /* .get_name            = */ ggml_backend_metal_name,
-    /* .free                = */ ggml_backend_metal_free,
-    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
-    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
-    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
-    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
-    /* .synchronize         = */ ggml_backend_metal_synchronize,
-    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
-    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
-    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
-    /* .graph_plan_free     = */ NULL,
-    /* .graph_plan_compute  = */ NULL,
-    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
-    /* .supports_op         = */ ggml_backend_metal_supports_op,
-};
-
-ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
-
-    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
-
-    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
-
-    *metal_backend = (struct ggml_backend) {
-        /* .interface = */ metal_backend_i,
-        /* .context   = */ ctx,
-    };
-
-    return metal_backend;
-}
-
-bool ggml_backend_is_metal(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_metal_name;
-}
-
-void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    ggml_metal_set_n_cb(ctx, n_cb);
-}
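The GGML_OP_SCALE and unary-op dispatches above now size the Metal grid as ggml_nelements(dst)/4, so each dispatched unit is responsible for four consecutive floats. A serial C sketch of that indexing, assuming n is divisible by 4; scale_groups_of_4 is an illustrative name, not a ggml function:

    #include <assert.h>

    /* One outer iteration stands in for one dispatched group; each group
     * scales the 4 consecutive elements it owns. */
    static void scale_groups_of_4(float * dst, const float * src, float scale, int n) {
        assert(n % 4 == 0);
        for (int g = 0; g < n / 4; g++) {
            for (int j = 0; j < 4; j++) {
                dst[4 * g + j] = src[4 * g + j] * scale;
            }
        }
    }

    int main(void) {
        float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        float out[8];
        scale_groups_of_4(out, in, 0.5f, 8);
        assert(out[0] == 0.5f && out[7] == 4.0f);
        return 0;
    }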
ggml-metal.metal
CHANGED
@@ -345,11 +345,10 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint ntg[[threads_per_threadgroup]]) {
-    device const float4 * x
-    device const float
-
-
-    float all_sum = 0;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+    float4 sumf=0;
+    float all_sum=0;

     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -362,7 +361,6 @@ kernel void kernel_rms_norm(
     }

     threadgroup_barrier(mem_flags::mem_threadgroup);
-
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
         if (tpitg < i) {
@@ -370,9 +368,7 @@ kernel void kernel_rms_norm(
         }
     }
     if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
-            sum[0] += x_scalar[i];
-        }
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
         sum[0] /= ne00;
     }
@@ -387,9 +383,7 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
-            y_scalar[i00] = x_scalar[i00] * scale;
-        }
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
     }
 }
ggml.c
CHANGED
@@ -162,16 +162,40 @@ typedef void * thread_ret_t;

 #define GGML_PRINT(...) printf(__VA_ARGS__)

-//
-// end of logging block
-//
-
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif

+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4928,7 +4952,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
-        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -11234,7 +11257,7 @@ static void ggml_compute_forward_silu_f32(

 #ifndef NDEBUG
     for (int k = 0; k < nc; k++) {
-        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
+        const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
         UNUSED(x);
         assert(!isnan(x));
         assert(!isinf(x));
@@ -13060,22 +13083,24 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }

-
+    const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

-
-
-    const
-
-    const
-    const
-
-    const
-    const
-    const
+    assert(n_past >= 0);
+
+    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int ne1 = src0->ne[1]; // seq_len_without_past
+    const int ne2 = src0->ne[2]; // n_head -> this is k
+    //const int ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int n  = ggml_nrows(src0);
+    const int ne2_ne3 = n/ne1; // ne2*ne3
+
+    const int nb0 = src0->nb[0];
+    const int nb1 = src0->nb[1];
+    const int nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];

     GGML_ASSERT(nb0 == sizeof(float));
@@ -13087,9 +13112,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

-    for (
-    for (
-    for (
+    for (int i = 0; i < ne0; i++) {
+        for (int j = 0; j < ne1; j++) {
+            for (int k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
@@ -13104,6 +13129,7 @@ static void ggml_compute_forward_alibi_f32(
                 }

                 pdst[0] = i * m_k + src[0];
+
             }
         }
     }
@@ -20174,10 +20200,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     ggml_vec_cpy_f32(nx, xp, x);
     ggml_vec_cpy_f32(nx, gp, g);

-    // TODO: instead of passing &cancel here, use the return code of the linesearch
-    //       to determine if the optimization should be cancelled
-    //       this is a simple change, but not doing this atm, since I don't have a nice
-    //       way to test and don't want to break something with so many changes lined up
     ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
     if (cancel) {
         return GGML_OPT_CANCEL;
ggml.h
CHANGED
@@ -326,7 +326,7 @@ extern "C" {
     GGML_TYPE_COUNT,
 };

-    enum
+    enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
         GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,10 +479,8 @@ extern "C" {

     // n-dimensional tensor
     struct ggml_tensor {
-        enum ggml_type
-        enum
-
-        struct ggml_backend_buffer * buffer;
+        enum ggml_type    type;
+        enum ggml_backend backend;

         int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -516,7 +514,7 @@ extern "C" {

     void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[
+        char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1360,7 +1358,7 @@ extern "C" {

     // alibi position embedding
     // in-place, returns view(a)
-
+    struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
@@ -1369,7 +1367,7 @@ extern "C" {

     // clamp
     // in-place, returns view(a)
-
+    struct ggml_tensor * ggml_clamp(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             float                 min,
@@ -2104,7 +2102,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;

-
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef __cplusplus
 }
gguf-py/gguf/gguf.py
CHANGED
@@ -88,31 +88,29 @@ class MODEL_ARCH(IntEnum):
|
|
88 |
PERSIMMON : int = auto()
|
89 |
REFACT : int = auto()
|
90 |
BERT : int = auto()
|
91 |
-
BLOOM : int = auto()
|
92 |
|
93 |
|
94 |
class MODEL_TENSOR(IntEnum):
|
95 |
-
TOKEN_EMBD
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
ATTN_K_NORM : int = auto()
|
116 |
|
117 |
|
118 |
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
@@ -127,31 +125,29 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
127 |
MODEL_ARCH.PERSIMMON: "persimmon",
|
128 |
MODEL_ARCH.REFACT: "refact",
|
129 |
MODEL_ARCH.BERT: "bert",
|
130 |
-
MODEL_ARCH.BLOOM: "bloom",
|
131 |
}
|
132 |
|
133 |
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
134 |
-
MODEL_TENSOR.TOKEN_EMBD:
|
135 |
-
MODEL_TENSOR.
|
136 |
-
MODEL_TENSOR.
|
137 |
-
MODEL_TENSOR.
|
138 |
-
MODEL_TENSOR.
|
139 |
-
MODEL_TENSOR.
|
140 |
-
MODEL_TENSOR.
|
141 |
-
MODEL_TENSOR.
|
142 |
-
MODEL_TENSOR.
|
143 |
-
MODEL_TENSOR.
|
144 |
-
MODEL_TENSOR.
|
145 |
-
MODEL_TENSOR.
|
146 |
-
MODEL_TENSOR.
|
147 |
-
MODEL_TENSOR.
|
148 |
-
MODEL_TENSOR.
|
149 |
-
MODEL_TENSOR.
|
150 |
-
MODEL_TENSOR.
|
151 |
-
MODEL_TENSOR.
|
152 |
-
MODEL_TENSOR.
|
153 |
-
MODEL_TENSOR.
|
154 |
-
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
155 |
}
|
156 |
|
157 |
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
@@ -286,18 +282,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
286 |
MODEL_TENSOR.FFN_DOWN,
|
287 |
MODEL_TENSOR.FFN_UP,
|
288 |
],
|
289 |
-
MODEL_ARCH.BLOOM: [
|
290 |
-
MODEL_TENSOR.TOKEN_EMBD,
|
291 |
-
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
292 |
-
MODEL_TENSOR.OUTPUT_NORM,
|
293 |
-
MODEL_TENSOR.OUTPUT,
|
294 |
-
MODEL_TENSOR.ATTN_NORM,
|
295 |
-
MODEL_TENSOR.ATTN_QKV,
|
296 |
-
MODEL_TENSOR.ATTN_OUT,
|
297 |
-
MODEL_TENSOR.FFN_NORM,
|
298 |
-
MODEL_TENSOR.FFN_DOWN,
|
299 |
-
MODEL_TENSOR.FFN_UP,
|
300 |
-
],
|
301 |
MODEL_ARCH.GPT2: [
|
302 |
# TODO
|
303 |
],
|
@@ -327,7 +311,6 @@ class TensorNameMap:
|
|
327 |
"gpt_neox.embed_in", # gptneox
|
328 |
"transformer.wte", # gpt2 gpt-j mpt refact
|
329 |
"transformer.word_embeddings", # falcon
|
330 |
-
"word_embeddings", # bloom
|
331 |
"model.embed_tokens", # llama-hf
|
332 |
"tok_embeddings", # llama-pth
|
333 |
"embeddings.word_embeddings", # bert
|
@@ -339,11 +322,6 @@ class TensorNameMap:
|
|
339 |
"embeddings.token_type_embeddings", # bert
|
340 |
),
|
341 |
|
342 |
-
# Normalization of token embeddings
|
343 |
-
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
344 |
-
"word_embeddings_layernorm", # bloom
|
345 |
-
),
|
346 |
-
|
347 |
# Position embeddings
|
348 |
MODEL_TENSOR.POS_EMBD: (
|
349 |
"transformer.wpe", # gpt2
|
@@ -354,7 +332,7 @@ class TensorNameMap:
|
|
354 |
MODEL_TENSOR.OUTPUT: (
|
355 |
"embed_out", # gptneox
|
356 |
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
357 |
-
"output", # llama-pth
|
358 |
"word_embeddings_for_head", # persimmon
|
359 |
),
|
360 |
|
@@ -366,7 +344,7 @@ class TensorNameMap:
|
|
366 |
"norm", # llama-pth
|
367 |
"embeddings.LayerNorm", # bert
|
368 |
"transformer.norm_f", # mpt
|
369 |
-
"ln_f", # refact
|
370 |
"language_model.encoder.final_layernorm", # persimmon
|
371 |
),
|
372 |
|
@@ -383,7 +361,6 @@ class TensorNameMap:
|
|
383 |
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
384 |
"transformer.blocks.{bid}.norm_1", # mpt
|
385 |
"transformer.h.{bid}.input_layernorm", # falcon7b
|
386 |
-
"h.{bid}.input_layernorm", # bloom
|
387 |
"transformer.h.{bid}.ln_mlp", # falcon40b
|
388 |
"model.layers.{bid}.input_layernorm", # llama-hf
|
389 |
"layers.{bid}.attention_norm", # llama-pth
|
@@ -402,7 +379,6 @@ class TensorNameMap:
|
|
402 |
"transformer.h.{bid}.attn.c_attn", # gpt2
|
403 |
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
404 |
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
405 |
-
"h.{bid}.self_attention.query_key_value", # bloom
|
406 |
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
407 |
),
|
408 |
|
@@ -436,7 +412,6 @@ class TensorNameMap:
|
|
436 |
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
437 |
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
438 |
"transformer.h.{bid}.self_attention.dense", # falcon
|
439 |
-
"h.{bid}.self_attention.dense", # bloom
|
440 |
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
441 |
"layers.{bid}.attention.wo", # llama-pth
|
442 |
"encoder.layer.{bid}.attention.output.dense", # bert
|
@@ -454,7 +429,6 @@ class TensorNameMap:
|
|
454 |
MODEL_TENSOR.FFN_NORM: (
|
455 |
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
456 |
"transformer.h.{bid}.ln_2", # gpt2 refact
|
457 |
-
"h.{bid}.post_attention_layernorm", # bloom
|
458 |
"transformer.blocks.{bid}.norm_2", # mpt
|
459 |
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
460 |
"layers.{bid}.ffn_norm", # llama-pth
|
@@ -468,7 +442,6 @@ class TensorNameMap:
|
|
468 |
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
469 |
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
470 |
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
471 |
-
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
472 |
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
473 |
"layers.{bid}.feed_forward.w3", # llama-pth
|
474 |
"encoder.layer.{bid}.intermediate.dense", # bert
|
@@ -488,7 +461,6 @@ class TensorNameMap:
|
|
488 |
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
489 |
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
490 |
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
491 |
-
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
492 |
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
493 |
"layers.{bid}.feed_forward.w2", # llama-pth
|
494 |
"encoder.layer.{bid}.output.dense", # bert
|
|
|
88 |
PERSIMMON : int = auto()
|
89 |
REFACT : int = auto()
|
90 |
BERT : int = auto()
|
|
|
91 |
|
92 |
|
93 |
class MODEL_TENSOR(IntEnum):
|
94 |
+
TOKEN_EMBD : int = auto()
|
95 |
+
TOKEN_TYPES : int = auto()
|
96 |
+
POS_EMBD : int = auto()
|
97 |
+
OUTPUT : int = auto()
  98 +     OUTPUT_NORM : int = auto()
  99 +     ROPE_FREQS : int = auto()
 100 +     ATTN_Q : int = auto()
 101 +     ATTN_K : int = auto()
 102 +     ATTN_V : int = auto()
 103 +     ATTN_QKV : int = auto()
 104 +     ATTN_OUT : int = auto()
 105 +     ATTN_NORM : int = auto()
 106 +     ATTN_NORM_2 : int = auto()
 107 +     ATTN_ROT_EMBD: int = auto()
 108 +     FFN_GATE : int = auto()
 109 +     FFN_DOWN : int = auto()
 110 +     FFN_UP : int = auto()
 111 +     FFN_NORM : int = auto()
 112 +     ATTN_Q_NORM : int = auto()
 113 +     ATTN_K_NORM : int = auto()

 116   MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {

 125       MODEL_ARCH.PERSIMMON: "persimmon",
 126       MODEL_ARCH.REFACT: "refact",
 127       MODEL_ARCH.BERT: "bert",
 128   }

 130   TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
 131 +     MODEL_TENSOR.TOKEN_EMBD: "token_embd",
 132 +     MODEL_TENSOR.TOKEN_TYPES: "token_types",
 133 +     MODEL_TENSOR.POS_EMBD: "position_embd",
 134 +     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
 135 +     MODEL_TENSOR.OUTPUT: "output",
 136 +     MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
 137 +     MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
 138 +     MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
 139 +     MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
 140 +     MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
 141 +     MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
 142 +     MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
 143 +     MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
 144 +     MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
 145 +     MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
 146 +     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
 147 +     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
 148 +     MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
 149 +     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
 150 +     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
 151   }

 153   MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

 282           MODEL_TENSOR.FFN_DOWN,
 283           MODEL_TENSOR.FFN_UP,
 284       ],
 285       MODEL_ARCH.GPT2: [
 286           # TODO
 287       ],

 311       "gpt_neox.embed_in",  # gptneox
 312       "transformer.wte",  # gpt2 gpt-j mpt refact
 313       "transformer.word_embeddings",  # falcon
 314       "model.embed_tokens",  # llama-hf
 315       "tok_embeddings",  # llama-pth
 316       "embeddings.word_embeddings",  # bert

 322       "embeddings.token_type_embeddings",  # bert
 323   ),

 325   # Position embeddings
 326   MODEL_TENSOR.POS_EMBD: (
 327       "transformer.wpe",  # gpt2

 332   MODEL_TENSOR.OUTPUT: (
 333       "embed_out",  # gptneox
 334       "lm_head",  # gpt2 mpt falcon llama-hf baichuan
 335 +     "output",  # llama-pth
 336       "word_embeddings_for_head",  # persimmon
 337   ),

 344       "norm",  # llama-pth
 345       "embeddings.LayerNorm",  # bert
 346       "transformer.norm_f",  # mpt
 347 +     "ln_f",  # refact
 348       "language_model.encoder.final_layernorm",  # persimmon
 349   ),

 361       "transformer.h.{bid}.ln_1",  # gpt2 gpt-j refact
 362       "transformer.blocks.{bid}.norm_1",  # mpt
 363       "transformer.h.{bid}.input_layernorm",  # falcon7b
 364       "transformer.h.{bid}.ln_mlp",  # falcon40b
 365       "model.layers.{bid}.input_layernorm",  # llama-hf
 366       "layers.{bid}.attention_norm",  # llama-pth

 379       "transformer.h.{bid}.attn.c_attn",  # gpt2
 380       "transformer.blocks.{bid}.attn.Wqkv",  # mpt
 381       "transformer.h.{bid}.self_attention.query_key_value",  # falcon
 382       "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
 383   ),

 412       "transformer.h.{bid}.attn.c_proj",  # gpt2 refact
 413       "transformer.blocks.{bid}.attn.out_proj",  # mpt
 414       "transformer.h.{bid}.self_attention.dense",  # falcon
 415       "model.layers.{bid}.self_attn.o_proj",  # llama-hf
 416       "layers.{bid}.attention.wo",  # llama-pth
 417       "encoder.layer.{bid}.attention.output.dense",  # bert

 429   MODEL_TENSOR.FFN_NORM: (
 430       "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
 431       "transformer.h.{bid}.ln_2",  # gpt2 refact
 432       "transformer.blocks.{bid}.norm_2",  # mpt
 433       "model.layers.{bid}.post_attention_layernorm",  # llama-hf
 434       "layers.{bid}.ffn_norm",  # llama-pth

 442       "transformer.h.{bid}.mlp.c_fc",  # gpt2
 443       "transformer.blocks.{bid}.ffn.up_proj",  # mpt
 444       "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
 445       "model.layers.{bid}.mlp.up_proj",  # llama-hf refact
 446       "layers.{bid}.feed_forward.w3",  # llama-pth
 447       "encoder.layer.{bid}.intermediate.dense",  # bert

 461       "transformer.h.{bid}.mlp.c_proj",  # gpt2 refact
 462       "transformer.blocks.{bid}.ffn.down_proj",  # mpt
 463       "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
 464       "model.layers.{bid}.mlp.down_proj",  # llama-hf
 465       "layers.{bid}.feed_forward.w2",  # llama-pth
 466       "encoder.layer.{bid}.output.dense",  # bert
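A minimal sketch (not part of gguf.py) of how the "blk.{bid}" templates in TENSOR_NAMES above can be expanded into concrete per-block GGUF tensor names; the dictionary excerpt and the format_tensor_name helper are illustrative assumptions, not the library's API.

# Sketch: expand a "blk.{bid}" template for a given block index.
TENSOR_NAME_TEMPLATES = {
    "ATTN_NORM": "blk.{bid}.attn_norm",
    "ATTN_QKV":  "blk.{bid}.attn_qkv",
    "FFN_UP":    "blk.{bid}.ffn_up",
}

def format_tensor_name(key: str, bid: int) -> str:
    # hypothetical helper: substitute the block id into the template
    return TENSOR_NAME_TEMPLATES[key].format(bid=bid)

print(format_tensor_name("ATTN_QKV", 3))  # -> "blk.3.attn_qkv"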
gpttype_adapter.cpp
CHANGED
@@ -1768,7 +1768,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int realnpredict = params.n_predict-stopper_unused_tokens;
     float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
     float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
-    printf("\
+    printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)", time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
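Illustrative only: the arithmetic in the hunk above (milliseconds per generated token and overall tokens per second) reproduced as a standalone Python sketch; the sample timing values are assumptions.

time1, time2 = 1.8, 6.2          # assumed processing / generation seconds
realnpredict = 80                # assumed number of generated tokens
pt2 = time2 * 1000.0 / (1 if realnpredict == 0 else realnpredict)   # ms per generated token
tokens_per_second = 0 if realnpredict == 0 else realnpredict / (time1 + time2)
print(f"Generation:{time2:.1f}s ({pt2:.0f}ms/T), Total:{time1 + time2:.1f}s ({tokens_per_second:.1f}T/s)")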
koboldcpp.py
CHANGED
@@ -184,10 +184,6 @@ def init_library():
     os.add_dll_directory(dir_path)
     os.add_dll_directory(abs_path)
     os.add_dll_directory(os.getcwd())
-    if libname == lib_hipblas and "HIP_PATH" in os.environ:
-        os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
-        if args.debugmode == 1:
-            print(f"HIP/ROCm SDK at {os.environ['HIP_PATH']} included in .DLL load path")
     handle = ctypes.CDLL(os.path.join(dir_path, libname))

     handle.load_model.argtypes = [load_model_inputs]
@@ -365,7 +361,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.
+KcppVersion = "1.46.1"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
@@ -373,8 +369,6 @@ session_kudos_earned = 0
 session_jobs = 0
 session_starttime = None
 exitcounter = 0
-punishcounter = 0 #causes a timeout if too many errors
-rewardcounter = 0 #reduces error counts for successful jobs
 totalgens = 0
 currentusergenkey = "" #store a special key so polled streaming works even in multiuser
 args = None #global args
@@ -418,34 +412,16 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         elif api_format==4:
             # translate openai chat completion messages format into one big string.
             messages_array = genparams.get('messages', [])
-            adapter_obj = genparams.get('adapter', {})
             messages_string = ""
-            system_message_start = adapter_obj.get("system_start", "\n### Instruction:\n")
-            system_message_end = adapter_obj.get("system_end", "")
-            user_message_start = adapter_obj.get("user_start", "\n### Instruction:\n")
-            user_message_end = adapter_obj.get("user_end", "")
-            assistant_message_start = adapter_obj.get("assistant_start", "\n### Response:\n")
-            assistant_message_end = adapter_obj.get("assistant_end", "")
-
             for message in messages_array:
                 if message['role'] == "system":
-                    messages_string
+                    messages_string+="\n### Instruction:\n"
                 elif message['role'] == "user":
-                    messages_string
+                    messages_string+="\n### Instruction:\n"
                 elif message['role'] == "assistant":
-                    messages_string
+                    messages_string+="\n### Response:\n"
-
-
-
-                if message['role'] == "system":
-                    messages_string += system_message_end
-                elif message['role'] == "user":
-                    messages_string += user_message_end
-                elif message['role'] == "assistant":
-                    messages_string += assistant_message_end
-
-                messages_string += assistant_message_start
-
+                messages_string+=message['content']
+            messages_string += "\n### Response:\n"
             genparams["prompt"] = messages_string
             frqp = genparams.get('frequency_penalty', 0.1)
             scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
@@ -521,9 +497,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     async def handle_sse_stream(self, api_format):
         global friendlymodelname
         self.send_response(200)
-        self.send_header("
-        self.send_header("
-        self.end_headers(
+        self.send_header("Cache-Control", "no-cache")
+        self.send_header("Connection", "keep-alive")
+        self.end_headers(force_json=True, sse_stream_flag=True)

         current_token = 0
         incomplete_token_buffer = bytearray()
@@ -590,10 +566,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
         self.path = self.path.rstrip('/')
         response_body = None
-
+        force_json = False

         if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
-
+
             if self.embedded_kailite is None:
                 response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
             else:
@@ -639,9 +615,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

         elif self.path.endswith('/v1/models'):
             response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
+            force_json = True

         elif self.path=="/api":
-            content_type = 'text/html'
             if self.embedded_kcpp_docs is None:
                 response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
             else:
@@ -649,40 +625,41 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
             self.path = "/api"
             self.send_response(302)
-            self.send_header("
-            self.end_headers(
+            self.send_header("Location", self.path)
+            self.end_headers()
             return None

         if response_body is None:
             self.send_response(404)
-            self.end_headers(
+            self.end_headers()
             rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
             self.wfile.write(rp.encode())
         else:
             self.send_response(200)
-            self.send_header('
-            self.end_headers(
+            self.send_header('Content-Length', str(len(response_body)))
+            self.end_headers(force_json=force_json)
             self.wfile.write(response_body)
             return

     def do_POST(self):
         global modelbusy, requestsinqueue, currentusergenkey, totalgens
-        content_length = int(self.headers['
+        content_length = int(self.headers['Content-Length'])
         body = self.rfile.read(content_length)
         self.path = self.path.rstrip('/')
+        force_json = False
         if self.path.endswith(('/api/extra/tokencount')):
             try:
                 genparams = json.loads(body)
                 countprompt = genparams.get('prompt', "")
                 count = handle.token_count(countprompt.encode("UTF-8"))
                 self.send_response(200)
-                self.end_headers(
+                self.end_headers()
                 self.wfile.write(json.dumps({"value": count}).encode())

             except ValueError as e:
                 utfprint("Count Tokens - Body Error: " + str(e))
                 self.send_response(400)
-                self.end_headers(
+                self.end_headers()
                 self.wfile.write(json.dumps({"value": -1}).encode())
             return

@@ -695,11 +672,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             multiuserkey = ""
             pass

-        if (multiuserkey
+        if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
             ag = handle.abort_generate()
             time.sleep(0.3) #short delay before replying
             self.send_response(200)
-            self.end_headers(
+            self.end_headers()
             self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
             print("\nGeneration Aborted")
         else:
@@ -717,11 +694,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             pass

         if totalgens>0:
-            if (multiuserkey
+            if (multiuserkey!="" and multiuserkey==currentusergenkey) or requestsinqueue==0:
                 pendtxt = handle.get_pending_output()
                 pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
                 self.send_response(200)
-                self.end_headers(
+                self.end_headers()
                 self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
                 return

@@ -731,7 +708,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             requestsinqueue += 1
         if not modelbusy.acquire(blocking=reqblocking):
             self.send_response(503)
-            self.end_headers(
+            self.end_headers()
             self.wfile.write(json.dumps({"detail": {
                     "msg": "Server is busy; please try again later.",
                     "type": "service_unavailable",
@@ -757,9 +734,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

         if self.path.endswith('/v1/completions'):
             api_format = 3
+            force_json = True

         if self.path.endswith('/v1/chat/completions'):
             api_format = 4
+            force_json = True

         if api_format > 0:
             genparams = None
@@ -785,8 +764,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
                 # Headers are already sent when streaming
                 if not sse_stream_flag:
                     self.send_response(200)
-                    self.end_headers(
-
+                    self.end_headers(force_json=force_json)
+                    self.wfile.write(json.dumps(gen).encode())
             except:
                 print("Generate: The response could not be sent, maybe connection was terminated?")
                 return
@@ -794,23 +773,27 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
             modelbusy.release()

         self.send_response(404)
-        self.end_headers(
+        self.end_headers()


    def do_OPTIONS(self):
        self.send_response(200)
-        self.end_headers(
+        self.end_headers()

    def do_HEAD(self):
        self.send_response(200)
-        self.end_headers(
+        self.end_headers()
-
-    def end_headers(self,
-        self.send_header('
-        self.send_header('
-        self.send_header('
-        if
-
+
+    def end_headers(self, force_json=False, sse_stream_flag=False):
+        self.send_header('Access-Control-Allow-Origin', '*')
+        self.send_header('Access-Control-Allow-Methods', '*')
+        self.send_header('Access-Control-Allow-Headers', '*')
+        if ("/api" in self.path and self.path!="/api") or force_json:
+            if sse_stream_flag:
+                self.send_header('Content-type', 'text/event-stream')
+            self.send_header('Content-type', 'application/json')
+        else:
+            self.send_header('Content-type', 'text/html')
        return super(ServerRequestHandler, self).end_headers()

@@ -1034,8 +1017,7 @@ def show_new_gui():
     mmq_var = ctk.IntVar(value=1)
     blas_threads_var = ctk.StringVar()
     blas_size_var = ctk.IntVar()
-    version_var =
-    tensor_split_str_vars = ctk.StringVar(value="")
+    version_var =ctk.StringVar(value="0")

     smartcontext = ctk.IntVar()
     context_var = ctk.IntVar()
@@ -1087,15 +1069,11 @@ def show_new_gui():
             quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
             mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
             quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
-            tensor_split_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
-            tensor_split_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
         else:
             lowvram_box.grid_forget()
             quick_lowvram_box.grid_forget()
             mmq_box.grid_forget()
             quick_mmq_box.grid_forget()
-            tensor_split_label.grid_forget()
-            tensor_split_entry.grid_forget()

         if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
@@ -1108,7 +1086,6 @@ def show_new_gui():
             quick_gpu_layers_label.grid_forget()
             quick_gpu_layers_entry.grid_forget()

-
     # presets selector
     makelabel(quick_tab, "Presets:", 1)

@@ -1141,7 +1118,7 @@ def show_new_gui():
     makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)

     # load model
-    makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
+    makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170,filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])

     # Hardware Tab
     hardware_tab = tabcontent["Hardware"]
@@ -1160,7 +1137,6 @@ def show_new_gui():
     gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
     CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
     gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
-    tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 6, 80)
     lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
     mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)

@@ -1209,7 +1185,7 @@ def show_new_gui():
     # Model Tab
     model_tab = tabcontent["Model"]

-    makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
+    makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, filetypes=[("GGML Model Files", "*.gguf;*.bin;*.ggml")])
     makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
     makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)

@@ -1289,12 +1265,6 @@ def show_new_gui():
         args.noavx2 = True
         args.noblas = True
         args.nommap = True
-    if tensor_split_str_vars.get()!="":
-        tssv = tensor_split_str_vars.get()
-        if "," in tssv:
-            args.tensor_split = [float(x) for x in tssv.split(",")]
-        else:
-            args.tensor_split = [float(x) for x in tssv.split(" ")]

     args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())

@@ -1359,9 +1329,6 @@ def show_new_gui():
             runopts_var.set(openblas_option)
         if "gpulayers" in dict and dict["gpulayers"]:
             gpulayers_var.set(dict["gpulayers"])
-        if "tensor_split" in dict and dict["tensor_split"]:
-            tssep = ','.join(map(str, dict["tensor_split"]))
-            tensor_split_str_vars.set(tssep)
         if "blasthreads" in dict and dict["blasthreads"]:
             blas_threads_var.set(str(dict["blasthreads"]))
         else:
@@ -1480,7 +1447,7 @@ def show_gui_msgbox(title,message):
 def run_horde_worker(args, api_key, worker_name):
     import urllib.request
     from datetime import datetime
-    global friendlymodelname, maxhordectx, maxhordelen, exitcounter,
+    global friendlymodelname, maxhordectx, maxhordelen, exitcounter, modelbusy, session_starttime
     epurl = f"http://localhost:{args.port}"
     if args.host!="":
         epurl = f"http://{args.host}:{args.port}"
@@ -1489,11 +1456,10 @@ def run_horde_worker(args, api_key, worker_name):
         print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)

     def submit_completed_generation(url, jobid, sessionstart, submit_dict):
-        global exitcounter,
+        global exitcounter, session_kudos_earned, session_jobs
         reply = make_url_request(url, submit_dict)
         if not reply:
             exitcounter += 1
-            punishcounter += 1
             print_with_time(f"Error, Job submit failed.")
         else:
             reward = reply["reward"]
@@ -1507,11 +1473,6 @@ def run_horde_worker(args, api_key, worker_name):
             elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
             earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
             print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
-            rewardcounter += 1
-            if rewardcounter > 50:
-                rewardcounter = 0
-                if exitcounter > 5:
-                    exitcounter -= 1

     def make_url_request(url, data, method='POST'):
         try:
@@ -1520,7 +1481,7 @@ def run_horde_worker(args, api_key, worker_name):
             if method=='POST':
                 json_payload = json.dumps(data).encode('utf-8')
                 request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
-                request.add_header('
+                request.add_header('Content-Type', 'application/json')
             else:
                 request = urllib.request.Request(url, headers=headers, method=method)
             response_data = ""
@@ -1547,23 +1508,17 @@ def run_horde_worker(args, api_key, worker_name):
     print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
     BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
     cluster = "https://horde.koboldai.net"
-    while exitcounter <
+    while exitcounter < 10:
         time.sleep(3)
         readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
         if readygo:
             print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
             break

-    while exitcounter <
+    while exitcounter < 10:
         currentjob_attempts = 0
         current_generation = None

-        if punishcounter >= 10:
-            punishcounter = 0
-            print_with_time(f"Horde Worker Paused for 10 min - Too many errors. It will resume automatically.")
-            print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
-            time.sleep(600)
-
         #first, make sure we are not generating
         if modelbusy.locked():
             time.sleep(0.2)
@@ -1582,7 +1537,6 @@ def run_horde_worker(args, api_key, worker_name):
         pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
         if not pop:
             exitcounter += 1
-            punishcounter += 1
             print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
             time.sleep(5)
             continue
@@ -1601,7 +1555,7 @@ def run_horde_worker(args, api_key, worker_name):
         print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")

         #do gen
-        while exitcounter <
+        while exitcounter < 10:
             if not modelbusy.locked():
                 current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
                 if current_generation:
@@ -1926,10 +1880,4 @@ if __name__ == '__main__':
     parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
     parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')

-    # #deprecated hidden args. they do nothing. do not use
-    # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS)
-    # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS)
-    # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS)
-    # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS)
-
     main(parser.parse_args(),start_server=True)
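A standalone sketch of the api_format==4 translation shown in the koboldcpp.py diff above: OpenAI-style chat messages are flattened into a single Alpaca-style prompt string. The flatten_chat_messages name and the sample messages list are illustrative only.

def flatten_chat_messages(messages_array):
    messages_string = ""
    for message in messages_array:
        if message['role'] == "system":
            messages_string += "\n### Instruction:\n"
        elif message['role'] == "user":
            messages_string += "\n### Instruction:\n"
        elif message['role'] == "assistant":
            messages_string += "\n### Response:\n"
        messages_string += message['content']
    messages_string += "\n### Response:\n"   # cue the model to answer as the assistant
    return messages_string

print(flatten_chat_messages([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]))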
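A minimal sketch of the header policy introduced by the new end_headers(force_json=..., sse_stream_flag=...) above, simplified to return a single content type instead of emitting HTTP headers; the pick_content_type function is a hypothetical stand-in for illustration, not part of koboldcpp.py.

def pick_content_type(path: str, force_json: bool = False, sse_stream_flag: bool = False) -> str:
    # API paths and forced-JSON endpoints get JSON (or an event stream when streaming);
    # everything else is served as HTML.
    if ("/api" in path and path != "/api") or force_json:
        if sse_stream_flag:
            return "text/event-stream"
        return "application/json"
    return "text/html"

assert pick_content_type("/api/v1/generate") == "application/json"
assert pick_content_type("/v1/chat/completions", force_json=True) == "application/json"
assert pick_content_type("/", force_json=False) == "text/html"
assert pick_content_type("/api/extra/generate/stream", sse_stream_flag=True) == "text/event-stream"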
llama.cpp
CHANGED
@@ -189,7 +189,6 @@ enum llm_arch {
     LLM_ARCH_STARCODER,
     LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
-    LLM_ARCH_BLOOM,
     LLM_ARCH_UNKNOWN,
 };

@@ -203,8 +202,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
     { LLM_ARCH_PERSIMMON, "persimmon" },
-    { LLM_ARCH_REFACT, "refact"
-    { LLM_ARCH_BLOOM, "bloom" },
 };

 enum llm_kv {
@@ -307,7 +305,6 @@ struct LLM_KV {

 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
-    LLM_TENSOR_TOKEN_EMBD_NORM,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
@@ -428,14 +425,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         LLM_ARCH_MPT,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -470,21 +459,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_BLOOM,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-        },
-    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1042,9 +1016,6 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;

-    float f_clamp_kqv;
-    float f_max_alibi_bias;
-
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
         if (this->n_vocab != other.n_vocab) return true;
@@ -1230,8 +1201,6 @@ struct llama_model {

     struct ggml_tensor * tok_embeddings;
     struct ggml_tensor * pos_embeddings;
-    struct ggml_tensor * tok_norm;
-    struct ggml_tensor * tok_norm_b;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1361,11 +1330,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    // TODO: this should be:
-    //   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    //   change it and test that it works
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -1771,7 +1736,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta,
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ctx, true);
         }
@@ -1789,7 +1754,7 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

         if (cur == NULL) {
@@ -2082,13 +2047,13 @@ static void llm_load_hparams(
             }
         } break;
     case LLM_ARCH_PERSIMMON:
-
-
-
-
-
-
-
     case LLM_ARCH_REFACT:
         {
             GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -2097,33 +2062,6 @@ static void llm_load_hparams(
             default: model.type = e_model::MODEL_UNKNOWN;
         }
     } break;
-    case LLM_ARCH_BLOOM:
-        {
-            GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-
-            switch (hparams.n_layer) {
-                case 24: model.type = e_model::MODEL_1B; break;
-                case 30:
-                    switch (hparams.n_embd) {
-                        case 2560: model.type = e_model::MODEL_3B; break;
-                        case 4096: model.type = e_model::MODEL_7B; break;
-                    } break;
-            }
-        } break;
-    case LLM_ARCH_MPT:
-        {
-            hparams.f_clamp_kqv = 0.0f;
-
-            GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-            GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
-            GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
-
-            switch (hparams.n_layer) {
-                case 32: model.type = e_model::MODEL_7B; break;
-                case 48: model.type = e_model::MODEL_30B; break;
-                default: model.type = e_model::MODEL_UNKNOWN;
-            }
-        } break;
     default: (void)0;
 }

@@ -2268,8 +2206,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
-    LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
-    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -2369,8 +2305,8 @@ static void llm_load_tensors(

         // output
         {
-
-

             if (n_gpu_layers > int(n_layer)) {
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2405,8 +2341,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const
-            const

             auto & layer = model.layers[i];

@@ -2435,8 +2371,8 @@ static void llm_load_tensors(
         {
             model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
             {
-
-

             if (n_gpu_layers > int(n_layer)) {
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2471,8 +2407,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const
-            const

             auto & layer = model.layers[i];

@@ -2505,8 +2441,8 @@ static void llm_load_tensors(

         // output
         {
-
-

             if (n_gpu_layers > int(n_layer)) {
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2543,8 +2479,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const
-            const

             auto & layer = model.layers[i];

@@ -2582,8 +2518,8 @@ static void llm_load_tensors(

         // output
         {
-
-

             if (n_gpu_layers > int(n_layer)) {
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2620,8 +2556,8 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const
-            const

             auto & layer = model.layers[i];

@@ -2659,8 +2595,8 @@ static void llm_load_tensors(
         model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

         {
-
-

             if (n_gpu_layers > int(n_layer)) {
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
@@ -2694,8 +2630,8 @@ static void llm_load_tensors(
         const int i_gpu_start = n_layer - n_gpu_layers;
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const
-            const
             auto & layer = model.layers[i];
             layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
             layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
@@ -2715,155 +2651,6 @@ static void llm_load_tensors(
             layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
         }
     } break;
-    case LLM_ARCH_BLOOM:
-        {
-            // TODO: CPU-only for now
-
-            model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-            model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
-            model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
-
-            // output
-            {
-                ggml_backend_type backend_norm;
-                ggml_backend_type backend_output;
-
-                if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-#endif // _WIN32
-
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
-                } else {
-                    backend_norm = GGML_BACKEND_CPU;
-                    backend_output = GGML_BACKEND_CPU;
-                }
-
-                model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-                model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-                model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                if (backend_norm == GGML_BACKEND_GPU) {
-                    vram_weights += ggml_nbytes(model.output_norm);
-                    vram_weights += ggml_nbytes(model.output_norm_b);
-                }
-                if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                    vram_weights += ggml_nbytes(model.output);
-                }
-            }
-
-            const uint32_t n_ff = hparams.n_ff;
-
-            const int i_gpu_start = n_layer - n_gpu_layers;
-
-            model.layers.resize(n_layer);
-
-            for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
-
-                auto & layer = model.layers[i];
-
-                layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
-
-                layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
-
-                layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-                layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
-
-                layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
-                layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
-
-                layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
-
-                layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-                layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
-
-                if (backend == GGML_BACKEND_GPU) {
-                    vram_weights +=
-                        ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
-                        ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
-                        ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
-                        ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
-                        ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
-                        ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
-                }
-            }
-        } break;
-    case LLM_ARCH_MPT:
-        {
-            model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
-            // output
-            {
-                ggml_backend_type backend_norm;
-                ggml_backend_type backend_output;
-
-                if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = LLAMA_BACKEND_OFFLOAD;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-#endif // _WIN32
-
-                    backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
-                } else {
-                    backend_norm = GGML_BACKEND_CPU;
-                    backend_output = GGML_BACKEND_CPU;
-                }
-
-                model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-                model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-
-                if (backend_norm == GGML_BACKEND_GPU) {
-                    vram_weights += ggml_nbytes(model.output_norm);
-                }
-                if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                    vram_weights += ggml_nbytes(model.output);
-                }
-            }
-
-            const uint32_t n_ff = hparams.n_ff;
-
-            const int i_gpu_start = n_layer - n_gpu_layers;
-
-            model.layers.resize(n_layer);
-
-            for (uint32_t i = 0; i < n_layer; ++i) {
-                const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
-
-                auto & layer = model.layers[i];
-
-                layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
-                layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-
-                layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
-
-                layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
-                layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
-
-                if (backend == GGML_BACKEND_GPU) {
-                    vram_weights +=
-                        ggml_nbytes(layer.attn_norm) +
-                        ggml_nbytes(layer.wqkv) +
-                        ggml_nbytes(layer.wo) +
-                        ggml_nbytes(layer.ffn_norm) +
-                        ggml_nbytes(layer.w2) +
-                        ggml_nbytes(layer.w3);
-                }
-            }
-        } break;
     default:
         throw std::runtime_error("unknown architecture");
 }
@@ -4720,6 +4507,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     return gf;
 }

 static struct ggml_cgraph * llm_build_persimmon(
         llama_context & lctx,
     const llama_batch & batch) {
@@ -5117,571 +4905,12 @@ static struct ggml_cgraph * llm_build_persimmon(
     return gf;
 }

-static struct ggml_cgraph *
         llama_context & lctx,
     const llama_batch & batch) {
-    const auto & model
-    const auto & hparams = model.hparams;
-    const auto & cparams = lctx.cparams;
-
-    const auto & kv_self = lctx.kv_self;
-
-    GGML_ASSERT(!!kv_self.ctx);
-
-    const int64_t n_embd = hparams.n_embd;
-    const int64_t n_layer = hparams.n_layer;
-    const int64_t n_ctx = cparams.n_ctx;
-    const int64_t n_head = hparams.n_head;
-    const int64_t n_head_kv = hparams.n_head_kv;
-    const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_embd_gqa = hparams.n_embd_gqa();
-
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-    const float norm_eps = hparams.f_norm_eps;
-
-    const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
-    const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
-
-    auto & buf_compute = lctx.buf_compute;
-
-    struct ggml_init_params params = {
-        /*.mem_size =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc =*/ false,
-    };
-
-    params.no_alloc = true;
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * cur;
-    struct ggml_tensor * token;
-    struct ggml_tensor * inpL;
-
-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-
-        ggml_allocr_alloc(lctx.alloc, inp_tokens);
-        if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
-        }
-        ggml_set_name(inp_tokens, "inp_tokens");
-
-        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_allocr_alloc(lctx.alloc, token);
-        if (!ggml_allocr_is_measure(lctx.alloc)) {
-            memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
-        }
-    }
-
-    // KQ_scale
-    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-    ggml_allocr_alloc(lctx.alloc, KQ_scale);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
-    }

-
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    ggml_set_name(KQ_mask, "KQ_mask");
-    ggml_allocr_alloc(lctx.alloc, KQ_mask);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        float * data = (float *) KQ_mask->data;
-        memset(data, 0, ggml_nbytes(KQ_mask));
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
-                    }
-                }
-            }
-        }
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, token, norm_eps);
-        inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
-    }
-
-    ggml_set_name(inpL, "inpL");
-
-    for (int il = 0; il < n_layer; ++il) {
-        {
-            // Norm
-            cur = ggml_norm(ctx0, inpL, norm_eps);
-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
-        }
-
-        {
-            // Self Attention
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
-
-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
-
-            struct ggml_tensor * Qcur = tmpq;
-            struct ggml_tensor * Kcur = tmpk;
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
-                ggml_set_name(Vcur, "Vcur");
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
-                ggml_set_name(k, "k");
-
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
-                        ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
-            ggml_set_name(Q, "Q");
-
-            struct ggml_tensor * K =
-                ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_kv, n_head_kv,
-                        ggml_element_size(kv_self.k)*n_embd_gqa,
-                        ggml_element_size(kv_self.k)*n_embd_head,
-                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
-            ggml_set_name(K, "K");
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            ggml_set_name(KQ, "KQ");
-
-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
-            ggml_set_name(KQ_scaled, "KQ_scaled");
-
-            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
-            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
-            ggml_set_name(KQ_masked, "KQ_masked");
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-            ggml_set_name(KQ_soft_max, "KQ_soft_max");
-
-            // split cached V into n_head heads
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, kv_self.v,
-                        n_kv, n_embd_head, n_head_kv,
-                        ggml_element_size(kv_self.v)*n_ctx,
-                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
-                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
-            ggml_set_name(V, "V");
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-            ggml_set_name(KQV, "KQV");
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            ggml_set_name(KQV_merged, "KQV_merged");
-
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
-            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
-            ggml_set_name(cur, "KQV_merged_contiguous");
-        }
-
-        // Projection
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
-
-        // Add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // FF
-        {
-            // Norm
-            {
-                cur = ggml_norm(ctx0, inpFF, norm_eps);
-                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
5331 |
-
}
|
5332 |
-
|
5333 |
-
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
5334 |
-
|
5335 |
-
// GELU activation
|
5336 |
-
cur = ggml_gelu(ctx0, cur);
|
5337 |
-
|
5338 |
-
// Projection
|
5339 |
-
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
5340 |
-
}
|
5341 |
-
|
5342 |
-
inpL = ggml_add(ctx0, cur, inpFF);
|
5343 |
-
}
|
5344 |
-
|
5345 |
-
// Output Norm
|
5346 |
-
{
|
5347 |
-
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5348 |
-
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5349 |
-
}
|
5350 |
-
ggml_set_name(cur, "result_norm");
|
5351 |
-
|
5352 |
-
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5353 |
-
ggml_set_name(cur, "result_output");
|
5354 |
-
|
5355 |
-
ggml_build_forward_expand(gf, cur);
|
5356 |
-
|
5357 |
-
ggml_free(ctx0);
|
5358 |
-
|
5359 |
-
return gf;
|
5360 |
-
}
|
5361 |
-
|
5362 |
-
static struct ggml_cgraph * llm_build_mpt(
|
5363 |
-
llama_context & lctx,
|
5364 |
-
const llama_batch & batch) {
|
5365 |
-
const auto & model = lctx.model;
|
5366 |
-
const auto & hparams = model.hparams;
|
5367 |
-
const auto & cparams = lctx.cparams;
|
5368 |
-
|
5369 |
-
const auto & kv_self = lctx.kv_self;
|
5370 |
-
|
5371 |
-
GGML_ASSERT(!!kv_self.ctx);
|
5372 |
-
|
5373 |
-
const int64_t n_embd = hparams.n_embd;
|
5374 |
-
const int64_t n_layer = hparams.n_layer;
|
5375 |
-
const int64_t n_ctx = cparams.n_ctx;
|
5376 |
-
const int64_t n_head = hparams.n_head;
|
5377 |
-
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
|
5378 |
-
const int64_t n_embd_head = hparams.n_embd_head();
|
5379 |
-
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5380 |
-
|
5381 |
-
const float norm_eps = hparams.f_norm_eps;
|
5382 |
-
const float clamp_kqv = hparams.f_clamp_kqv;
|
5383 |
-
const float max_alibi_bias = hparams.f_max_alibi_bias;
|
5384 |
-
|
5385 |
-
const int n_gpu_layers = model.n_gpu_layers;
|
5386 |
-
|
5387 |
-
const int32_t n_tokens = batch.n_tokens;
|
5388 |
-
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5389 |
-
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5390 |
-
|
5391 |
-
auto & buf_compute = lctx.buf_compute;
|
5392 |
-
|
5393 |
-
struct ggml_init_params params = {
|
5394 |
-
/*.mem_size =*/ buf_compute.size,
|
5395 |
-
/*.mem_buffer =*/ buf_compute.data,
|
5396 |
-
/*.no_alloc =*/ false,
|
5397 |
-
};
|
5398 |
-
|
5399 |
-
params.no_alloc = true;
|
5400 |
-
|
5401 |
-
struct ggml_context * ctx0 = ggml_init(params);
|
5402 |
-
|
5403 |
-
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5404 |
-
|
5405 |
-
struct ggml_tensor * cur;
|
5406 |
-
struct ggml_tensor * inpL;
|
5407 |
-
|
5408 |
-
//int warmup = 0;
|
5409 |
-
if (batch.token) {
|
5410 |
-
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5411 |
-
|
5412 |
-
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5413 |
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5414 |
-
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5415 |
-
//warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
|
5416 |
-
}
|
5417 |
-
|
5418 |
-
ggml_set_name(inp_tokens, "inp_tokens");
|
5419 |
-
|
5420 |
-
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5421 |
-
} else {
|
5422 |
-
#ifdef GGML_USE_MPI
|
5423 |
-
GGML_ASSERT(false && "not implemented");
|
5424 |
-
#endif
|
5425 |
-
|
5426 |
-
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5427 |
-
|
5428 |
-
ggml_allocr_alloc(lctx.alloc, inpL);
|
5429 |
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5430 |
-
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
5431 |
-
}
|
5432 |
-
}
|
5433 |
-
|
5434 |
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
5435 |
-
(void) i_gpu_start;
|
5436 |
-
|
5437 |
-
// offload functions set the tensor output backend to GPU
|
5438 |
-
// tensors are GPU-accelerated if any input or the output has been offloaded
|
5439 |
-
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
5440 |
-
offload_func_t offload_func_kq = llama_nop;
|
5441 |
-
offload_func_t offload_func_v = llama_nop;
|
5442 |
-
|
5443 |
-
#ifdef GGML_USE_CUBLAS
|
5444 |
-
if (n_gpu_layers > n_layer) {
|
5445 |
-
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
5446 |
-
}
|
5447 |
-
if (n_gpu_layers > n_layer + 1) {
|
5448 |
-
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
5449 |
-
}
|
5450 |
-
if (n_gpu_layers > n_layer + 2) {
|
5451 |
-
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
5452 |
-
}
|
5453 |
-
#endif // GGML_USE_CUBLAS
|
5454 |
-
|
5455 |
-
// KQ_scale
|
5456 |
-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5457 |
-
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5458 |
-
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5459 |
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5460 |
-
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5461 |
-
}
|
5462 |
-
|
5463 |
-
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5464 |
-
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5465 |
-
offload_func_kq(KQ_mask);
|
5466 |
-
ggml_set_name(KQ_mask, "KQ_mask");
|
5467 |
-
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5468 |
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5469 |
-
float * data = (float *) KQ_mask->data;
|
5470 |
-
memset(data, 0, ggml_nbytes(KQ_mask));
|
5471 |
-
|
5472 |
-
for (int h = 0; h < 1; ++h) {
|
5473 |
-
for (int j = 0; j < n_tokens; ++j) {
|
5474 |
-
const llama_pos pos = batch.pos[j];
|
5475 |
-
const llama_seq_id seq_id = batch.seq_id[j];
|
5476 |
-
|
5477 |
-
for (int i = 0; i < n_kv; ++i) {
|
5478 |
-
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5479 |
-
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5480 |
-
}
|
5481 |
-
}
|
5482 |
-
}
|
5483 |
-
}
|
5484 |
-
}
|
5485 |
-
|
5486 |
-
for (int il = 0; il < n_layer; ++il) {
|
5487 |
-
struct ggml_tensor * attn_norm;
|
5488 |
-
|
5489 |
-
offload_func_t offload_func = llama_nop;
|
5490 |
-
|
5491 |
-
#ifdef GGML_USE_CUBLAS
|
5492 |
-
if (il >= i_gpu_start) {
|
5493 |
-
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
5494 |
-
}
|
5495 |
-
#endif // GGML_USE_CUBLAS
|
5496 |
-
|
5497 |
-
// self-attention
|
5498 |
-
// TODO: refactor into common function (shared with LLaMA)
|
5499 |
-
{
|
5500 |
-
attn_norm = ggml_norm(ctx0, inpL, norm_eps);
|
5501 |
-
offload_func(attn_norm);
|
5502 |
-
|
5503 |
-
attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
|
5504 |
-
offload_func(attn_norm);
|
5505 |
-
|
5506 |
-
if (1) {
|
5507 |
-
cur = attn_norm;
|
5508 |
-
}
|
5509 |
-
|
5510 |
-
// compute QKV
|
5511 |
-
|
5512 |
-
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5513 |
-
offload_func_kq(cur);
|
5514 |
-
|
5515 |
-
if (clamp_kqv > 0.0f) {
|
5516 |
-
cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
|
5517 |
-
offload_func_kq(cur);
|
5518 |
-
}
|
5519 |
-
|
5520 |
-
const size_t wsize = ggml_type_size(cur->type);
|
5521 |
-
|
5522 |
-
struct ggml_tensor * Qcur = ggml_view_3d(
|
5523 |
-
ctx0, cur, n_embd_head, n_head, n_tokens,
|
5524 |
-
wsize * n_embd_head,
|
5525 |
-
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5526 |
-
0);
|
5527 |
-
offload_func_kq(Qcur);
|
5528 |
-
|
5529 |
-
struct ggml_tensor * Kcur = ggml_view_3d(
|
5530 |
-
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5531 |
-
wsize * n_embd_head,
|
5532 |
-
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5533 |
-
wsize * n_embd_head * n_head);
|
5534 |
-
offload_func_kq(Kcur);
|
5535 |
-
|
5536 |
-
struct ggml_tensor * tmpv = ggml_view_3d(
|
5537 |
-
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5538 |
-
wsize * n_embd_head,
|
5539 |
-
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5540 |
-
wsize * n_embd_head * (n_head + n_head_kv));
|
5541 |
-
offload_func_kq(Kcur);
|
5542 |
-
|
5543 |
-
ggml_set_name(Qcur, "Qcur");
|
5544 |
-
ggml_set_name(Kcur, "Kcur");
|
5545 |
-
|
5546 |
-
{
|
5547 |
-
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5548 |
-
offload_func_v(Vcur);
|
5549 |
-
offload_func_v(Vcur->src[0]->src[0]);
|
5550 |
-
ggml_set_name(Vcur, "Vcur");
|
5551 |
-
|
5552 |
-
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5553 |
-
offload_func_kq(k);
|
5554 |
-
ggml_set_name(k, "k");
|
5555 |
-
|
5556 |
-
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5557 |
-
( n_ctx)*ggml_element_size(kv_self.v),
|
5558 |
-
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5559 |
-
offload_func_v(v);
|
5560 |
-
|
5561 |
-
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5562 |
-
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5563 |
-
}
|
5564 |
-
|
5565 |
-
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
5566 |
-
offload_func_kq(Q);
|
5567 |
-
ggml_set_name(Q, "Q");
|
5568 |
-
|
5569 |
-
struct ggml_tensor * K =
|
5570 |
-
ggml_view_3d(ctx0, kv_self.k,
|
5571 |
-
n_embd_head, n_kv, n_head_kv,
|
5572 |
-
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5573 |
-
ggml_element_size(kv_self.k)*n_embd_head,
|
5574 |
-
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5575 |
-
offload_func_kq(K);
|
5576 |
-
ggml_set_name(K, "K");
|
5577 |
-
|
5578 |
-
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5579 |
-
offload_func_kq(KQ);
|
5580 |
-
ggml_set_name(KQ, "KQ");
|
5581 |
-
|
5582 |
-
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5583 |
-
offload_func_kq(KQ_scaled);
|
5584 |
-
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5585 |
-
|
5586 |
-
// TODO: replace with ggml_add()
|
5587 |
-
struct ggml_tensor * KQ_scaled_alibi =
|
5588 |
-
ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
|
5589 |
-
offload_func_kq(KQ_scaled_alibi);
|
5590 |
-
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5591 |
-
|
5592 |
-
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5593 |
-
offload_func_kq(KQ_masked);
|
5594 |
-
ggml_set_name(KQ_masked, "KQ_masked");
|
5595 |
-
|
5596 |
-
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
5597 |
-
offload_func_v(KQ_soft_max);
|
5598 |
-
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5599 |
-
|
5600 |
-
struct ggml_tensor * V =
|
5601 |
-
ggml_view_3d(ctx0, kv_self.v,
|
5602 |
-
n_kv, n_embd_head, n_head_kv,
|
5603 |
-
ggml_element_size(kv_self.v)*n_ctx,
|
5604 |
-
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5605 |
-
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5606 |
-
offload_func_v(V);
|
5607 |
-
ggml_set_name(V, "V");
|
5608 |
-
|
5609 |
-
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5610 |
-
offload_func_v(KQV);
|
5611 |
-
ggml_set_name(KQV, "KQV");
|
5612 |
-
|
5613 |
-
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5614 |
-
offload_func_v(KQV_merged);
|
5615 |
-
ggml_set_name(KQV_merged, "KQV_merged");
|
5616 |
-
|
5617 |
-
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5618 |
-
offload_func_v(cur);
|
5619 |
-
ggml_set_name(cur, "KQV_merged_contiguous");
|
5620 |
-
|
5621 |
-
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5622 |
-
offload_func(cur);
|
5623 |
-
ggml_set_name(cur, "result_wo");
|
5624 |
-
}
|
5625 |
-
|
5626 |
-
// Add the input
|
5627 |
-
cur = ggml_add(ctx0, cur, inpL);
|
5628 |
-
offload_func(cur);
|
5629 |
-
|
5630 |
-
struct ggml_tensor * attn_out = cur;
|
5631 |
-
|
5632 |
-
// feed forward
|
5633 |
-
{
|
5634 |
-
// Norm
|
5635 |
-
{
|
5636 |
-
cur = ggml_norm(ctx0, attn_out, norm_eps);
|
5637 |
-
offload_func(cur);
|
5638 |
-
|
5639 |
-
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
5640 |
-
offload_func(cur);
|
5641 |
-
}
|
5642 |
-
|
5643 |
-
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5644 |
-
offload_func(cur);
|
5645 |
-
|
5646 |
-
cur = ggml_gelu(ctx0, cur);
|
5647 |
-
offload_func(cur);
|
5648 |
-
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5649 |
-
offload_func(cur);
|
5650 |
-
}
|
5651 |
-
|
5652 |
-
cur = ggml_add(ctx0, cur, attn_out);
|
5653 |
-
offload_func(cur);
|
5654 |
-
// input for next layer
|
5655 |
-
inpL = cur;
|
5656 |
-
}
|
5657 |
-
|
5658 |
-
cur = inpL;
|
5659 |
-
|
5660 |
-
// norm
|
5661 |
-
{
|
5662 |
-
cur = ggml_norm(ctx0, cur, norm_eps);
|
5663 |
-
offload_func_nr(cur);
|
5664 |
-
|
5665 |
-
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5666 |
-
ggml_set_name(cur, "result_norm");
|
5667 |
-
}
|
5668 |
-
|
5669 |
-
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5670 |
-
ggml_set_name(cur, "result_output");
|
5671 |
-
|
5672 |
-
ggml_build_forward_expand(gf, cur);
|
5673 |
-
|
5674 |
-
ggml_free(ctx0);
|
5675 |
-
|
5676 |
-
return gf;
|
5677 |
-
}
|
5678 |
-
|
5679 |
-
static struct ggml_cgraph * llama_build_graph(
|
5680 |
-
llama_context & lctx,
|
5681 |
-
const llama_batch & batch) {
|
5682 |
-
const auto & model = lctx.model;
|
5683 |
-
|
5684 |
-
struct ggml_cgraph * result = NULL;
|
5685 |
|
5686 |
switch (model.arch) {
|
5687 |
case LLM_ARCH_LLAMA:
|
@@ -5708,14 +4937,6 @@ static struct ggml_cgraph * llama_build_graph(
5708   {
5709   result = llm_build_refact(lctx, batch);
5710   } break;
5711 - case LLM_ARCH_BLOOM:
5712 - {
5713 - result = llm_build_bloom(lctx, batch);
5714 - } break;
5715 - case LLM_ARCH_MPT:
5716 - {
5717 - result = llm_build_mpt(lctx, batch);
5718 - } break;
5719   default:
5720   GGML_ASSERT(false);
5721   }
@@ -5846,8 +5067,7 @@ static int llama_decode_internal(
5846   const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5847   model.arch == LLM_ARCH_BAICHUAN ||
5848   model.arch == LLM_ARCH_FALCON ||
5849 - model.arch == LLM_ARCH_REFACT ||
5850 - model.arch == LLM_ARCH_MPT;
5851   const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5852   if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5853   n_threads = 1;
@@ -6348,6 +5568,7 @@ private:
6348   for (int i = 0; i < (int)text_utf.size(); i++) {
6349   const std::string & utf_char = text_utf[i];
6350   bool split_condition = false;
6351   int bytes_remain = text_utf.size() - i;
6352   // forward backward lookups
6353   const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -6373,9 +5594,9 @@ private:
6373   if (!split_condition && bytes_remain >= 3) {
6374   // 're|'ve|'ll
6375   if (utf_char == "\'" && (
6376 - (utf_char_next == "r"
6377 - (utf_char_next == "v"
6378 - (utf_char_next == "l"
6379   ) {
6380   split_condition = true;
6381   }
@@ -6426,7 +5647,7 @@ private:
6426   else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
6427   split_condition = true;
6428   }
6429 - else if (collecting_whitespace_lookahead &&
6430   split_condition = true;
6431   }
6432   }
@@ -7945,7 +7166,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
7945   const std::string name = ggml_get_name(meta);
7946
7947   // TODO: avoid hardcoded tensor names - use the TN_* constants
7948 - if (name.find("attn_v.weight") != std::string::npos
7949   ++n_attention_wv;
7950   }
7951   else if (name.find("ffn_down.weight") != std::string::npos) {
189   LLM_ARCH_STARCODER,
190   LLM_ARCH_PERSIMMON,
191   LLM_ARCH_REFACT,
192   LLM_ARCH_UNKNOWN,
193   };
194

202   { LLM_ARCH_BAICHUAN, "baichuan" },
203   { LLM_ARCH_STARCODER, "starcoder" },
204   { LLM_ARCH_PERSIMMON, "persimmon" },
205 + { LLM_ARCH_REFACT, "refact" },
206   };
207
208   enum llm_kv {

305
306   enum llm_tensor {
307   LLM_TENSOR_TOKEN_EMBD,
308   LLM_TENSOR_POS_EMBD,
309   LLM_TENSOR_OUTPUT,
310   LLM_TENSOR_OUTPUT_NORM,

425   LLM_ARCH_MPT,
426   {
427   { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
428   },
429   },
430   {

459   { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
460   },
461   },
462   {
463   LLM_ARCH_UNKNOWN,
464   {

1016   float rope_freq_base_train;
1017   float rope_freq_scale_train;
1018
1019   bool operator!=(const llama_hparams & other) const {
1020   if (this->vocab_only != other.vocab_only) return true;
1021   if (this->n_vocab != other.n_vocab) return true;

1201
1202   struct ggml_tensor * tok_embeddings;
1203   struct ggml_tensor * pos_embeddings;
1204
1205   struct ggml_tensor * output_norm;
1206   struct ggml_tensor * output_norm_b;

1330   cache.cells.clear();
1331   cache.cells.resize(n_ctx);
1332
1333   cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
1334
1335   struct ggml_init_params params;
1336   params.mem_size = cache.buf.size;

1736   }
1737   }
1738
1739 + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
1740   if (backend != GGML_BACKEND_CPU) {
1741   ggml_set_no_alloc(ctx, true);
1742   }

1754   return tensor;
1755   }
1756
1757 + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
1758   struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
1759
1760   if (cur == NULL) {

2047   }
2048   } break;
2049   case LLM_ARCH_PERSIMMON:
2050 + {
2051 + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
2052 + switch (hparams.n_layer) {
2053 + case 36: model.type = e_model::MODEL_8B; break;
2054 + default: model.type = e_model::MODEL_UNKNOWN;
2055 + }
2056 + } break;
2057   case LLM_ARCH_REFACT:
2058   {
2059   GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));

2062   default: model.type = e_model::MODEL_UNKNOWN;
2063   }
2064   } break;
2065   default: (void)0;
2066   }
2067

2206   LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
2207   LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
2208   LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
2209   LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
2210   LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
2211   LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);

2305
2306   // output
2307   {
2308 + ggml_backend backend_norm;
2309 + ggml_backend backend_output;
2310
2311   if (n_gpu_layers > int(n_layer)) {
2312   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying

2341   model.layers.resize(n_layer);
2342
2343   for (uint32_t i = 0; i < n_layer; ++i) {
2344 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2345 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2346
2347   auto & layer = model.layers[i];
2348

2371   {
2372   model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2373   {
2374 + ggml_backend backend_norm;
2375 + ggml_backend backend_output;
2376
2377   if (n_gpu_layers > int(n_layer)) {
2378   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying

2407   model.layers.resize(n_layer);
2408
2409   for (uint32_t i = 0; i < n_layer; ++i) {
2410 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2411 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2412
2413   auto & layer = model.layers[i];
2414

2441
2442   // output
2443   {
2444 + ggml_backend backend_norm;
2445 + ggml_backend backend_output;
2446
2447   if (n_gpu_layers > int(n_layer)) {
2448   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying

2479   model.layers.resize(n_layer);
2480
2481   for (uint32_t i = 0; i < n_layer; ++i) {
2482 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2483 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2484
2485   auto & layer = model.layers[i];
2486

2518
2519   // output
2520   {
2521 + ggml_backend backend_norm;
2522 + ggml_backend backend_output;
2523
2524   if (n_gpu_layers > int(n_layer)) {
2525   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying

2556   model.layers.resize(n_layer);
2557
2558   for (uint32_t i = 0; i < n_layer; ++i) {
2559 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
2560 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
2561
2562   auto & layer = model.layers[i];
2563

2595   model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2596
2597   {
2598 + ggml_backend backend_norm;
2599 + ggml_backend backend_output;
2600
2601   if (n_gpu_layers > int(n_layer)) {
2602   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying

2630   const int i_gpu_start = n_layer - n_gpu_layers;
2631   model.layers.resize(n_layer);
2632   for (uint32_t i = 0; i < n_layer; ++i) {
2633 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2634 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
2635   auto & layer = model.layers[i];
2636   layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
2637   layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);

2651   layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
2652   }
2653   } break;
2654   default:
2655   throw std::runtime_error("unknown architecture");
2656   }

4507   return gf;
4508   }
4509
4510 +
4511   static struct ggml_cgraph * llm_build_persimmon(
4512   llama_context & lctx,
4513   const llama_batch & batch) {

4905   return gf;
4906   }
4907
4908 + static struct ggml_cgraph * llama_build_graph(
4909   llama_context & lctx,
4910   const llama_batch & batch) {
4911 + const auto & model = lctx.model;
4912
4913 + struct ggml_cgraph * result = NULL;
4914
4915   switch (model.arch) {
4916   case LLM_ARCH_LLAMA:

4937   {
4938   result = llm_build_refact(lctx, batch);
4939   } break;
4940   default:
4941   GGML_ASSERT(false);
4942   }

5067   const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
5068   model.arch == LLM_ARCH_BAICHUAN ||
5069   model.arch == LLM_ARCH_FALCON ||
5070 + model.arch == LLM_ARCH_REFACT;
5071   const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
5072   if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
5073   n_threads = 1;

5568   for (int i = 0; i < (int)text_utf.size(); i++) {
5569   const std::string & utf_char = text_utf[i];
5570   bool split_condition = false;
5571 + // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5572   int bytes_remain = text_utf.size() - i;
5573   // forward backward lookups
5574   const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";

5594   if (!split_condition && bytes_remain >= 3) {
5595   // 're|'ve|'ll
5596   if (utf_char == "\'" && (
5597 + (utf_char_next == "r" || utf_char_next_next == "e") ||
5598 + (utf_char_next == "v" || utf_char_next_next == "e") ||
5599 + (utf_char_next == "l" || utf_char_next_next == "l"))
5600   ) {
5601   split_condition = true;
5602   }

5647   else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5648   split_condition = true;
5649   }
5650 + else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5651   split_condition = true;
5652   }
5653   }

7166   const std::string name = ggml_get_name(meta);
7167
7168   // TODO: avoid hardcoded tensor names - use the TN_* constants
7169 + if (name.find("attn_v.weight") != std::string::npos) {
7170   ++n_attention_wv;
7171   }
7172   else if (name.find("ffn_down.weight") != std::string::npos) {
otherarch/llama_v3.cpp
CHANGED
@@ -63,8 +63,9 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
63   #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64   #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
66 -
67   #if !defined(GGML_USE_CUBLAS)
68   #define LLAMA_V3_USE_ALLOCATOR
69   #else
70   #define LLAMA_V3_USE_SCRATCH

@@ -724,7 +725,7 @@ struct llama_v3_model_loader {
724   }
725   }
726
727 - struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne,
728   auto it = tensors_map.name_to_idx.find(name);
729   if (it == tensors_map.name_to_idx.end()) {
730   throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));

@@ -738,7 +739,7 @@ struct llama_v3_model_loader {
738   return get_tensor_for(lt, backend);
739   }
740
741 - struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt,
742   struct ggml_tensor * tensor;
743   if (backend != GGML_BACKEND_CPU) {
744   ggml_set_no_alloc(ggml_ctx, true);

@@ -1229,8 +1230,8 @@ static void llama_v3_model_load_internal(
1229
1230   // "output" tensor
1231   {
1232 -
1233 -
1234   if (n_gpu_layers > int(n_layer)) { // NOLINT
1235   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1236   // on Windows however this is detrimental unless everything is on the GPU

@@ -1260,8 +1261,8 @@ static void llama_v3_model_load_internal(
1260
1261   model.layers.resize(n_layer);
1262   for (uint32_t i = 0; i < n_layer; ++i) {
1263 - const
1264 - const
1265
1266   auto & layer = model.layers[i];
1267

63   #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
64   #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
65
66 +
67   #if !defined(GGML_USE_CUBLAS)
68 + #include "ggml-alloc.h"
69   #define LLAMA_V3_USE_ALLOCATOR
70   #else
71   #define LLAMA_V3_USE_SCRATCH

725   }
726   }
727
728 + struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
729   auto it = tensors_map.name_to_idx.find(name);
730   if (it == tensors_map.name_to_idx.end()) {
731   throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));

739   return get_tensor_for(lt, backend);
740   }
741
742 + struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
743   struct ggml_tensor * tensor;
744   if (backend != GGML_BACKEND_CPU) {
745   ggml_set_no_alloc(ggml_ctx, true);

1230
1231   // "output" tensor
1232   {
1233 + ggml_backend backend_norm;
1234 + ggml_backend backend_output;
1235   if (n_gpu_layers > int(n_layer)) { // NOLINT
1236   // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
1237   // on Windows however this is detrimental unless everything is on the GPU

1261
1262   model.layers.resize(n_layer);
1263   for (uint32_t i = 0; i < n_layer; ++i) {
1264 + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
1265 + const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
1266
1267   auto & layer = model.layers[i];
1268
spm-headers/ggml.h
CHANGED
@@ -326,7 +326,7 @@ extern "C" {
326   GGML_TYPE_COUNT,
327   };
328
329 - enum
330   GGML_BACKEND_CPU = 0,
331   GGML_BACKEND_GPU = 10,
332   GGML_BACKEND_GPU_SPLIT = 20,

@@ -479,10 +479,8 @@ extern "C" {
479
480   // n-dimensional tensor
481   struct ggml_tensor {
482 - enum ggml_type
483 - enum
484 -
485 - struct ggml_backend_buffer * buffer;
486
487   int n_dims;
488   int64_t ne[GGML_MAX_DIMS]; // number of elements

@@ -516,7 +514,7 @@ extern "C" {
516
517   void * extra; // extra things e.g. for ggml-cuda.cu
518
519 - char padding[
520   };
521
522   static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -1360,7 +1358,7 @@ extern "C" {
1360
1361   // alibi position embedding
1362   // in-place, returns view(a)
1363 -
1364   struct ggml_context * ctx,
1365   struct ggml_tensor * a,
1366   int n_past,

@@ -1369,7 +1367,7 @@ extern "C" {
1369
1370   // clamp
1371   // in-place, returns view(a)
1372 -
1373   struct ggml_context * ctx,
1374   struct ggml_tensor * a,
1375   float min,

@@ -2104,7 +2102,7 @@ extern "C" {
2104   enum ggml_type vec_dot_type;
2105   } ggml_type_traits_t;
2106
2107 -
2108
2109   #ifdef __cplusplus
2110   }

326   GGML_TYPE_COUNT,
327   };
328
329 + enum ggml_backend {
330   GGML_BACKEND_CPU = 0,
331   GGML_BACKEND_GPU = 10,
332   GGML_BACKEND_GPU_SPLIT = 20,

479
480   // n-dimensional tensor
481   struct ggml_tensor {
482 + enum ggml_type type;
483 + enum ggml_backend backend;
484
485   int n_dims;
486   int64_t ne[GGML_MAX_DIMS]; // number of elements

514
515   void * extra; // extra things e.g. for ggml-cuda.cu
516
517 + char padding[4];
518   };
519
520   static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

1358
1359   // alibi position embedding
1360   // in-place, returns view(a)
1361 + struct ggml_tensor * ggml_alibi(
1362   struct ggml_context * ctx,
1363   struct ggml_tensor * a,
1364   int n_past,

1367
1368   // clamp
1369   // in-place, returns view(a)
1370 + struct ggml_tensor * ggml_clamp(
1371   struct ggml_context * ctx,
1372   struct ggml_tensor * a,
1373   float min,

2102   enum ggml_type vec_dot_type;
2103   } ggml_type_traits_t;
2104
2105 + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2106
2107   #ifdef __cplusplus
2108   }
tests/test-tokenizer-0-falcon.cpp
CHANGED
@@ -36,8 +36,6 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
36   { " Hello" , { 258, 23090, }, },
37   { " Hello" , { 466, 23090, }, },
38   { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
39 - { "\n =" , { 1212, 40, }, },
40 - { "' era" , { 18, 4932, }, },
41   };
42
43   return _k_tests;

@@ -157,7 +155,7 @@ int main(int argc, char **argv) {
157
158   fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
159
160 - const std::vector<llama_token> res = llama_tokenize(ctx, text,
161
162   fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
163

@@ -171,8 +169,10 @@ int main(int argc, char **argv) {
171   }
172
173   for (const auto & tok : res) {
174 - ofs << tok << "
175   }
176   }
177
178   fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

36   { " Hello" , { 258, 23090, }, },
37   { " Hello" , { 466, 23090, }, },
38   { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
39   };
40
41   return _k_tests;

155
156   fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
157
158 + const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
159
160   fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
161

169   }
170
171   for (const auto & tok : res) {
172 + ofs << tok << " ";
173   }
174 +
175 + ofs << "\n";
176   }
177
178   fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-falcon.py
CHANGED
@@ -41,8 +41,6 @@ tests = [
41   " Hello",
42   " Hello",
43   " Hello\n Hello",
44 - "\n =",
45 - "' era",
46   ]
47
48   for text in tests:

@@ -71,14 +69,15 @@ fname_tok = args.fname_tok
71   if fname_tok:
72   print('tokenizing file: ', fname_tok)
73   fname_out = fname_tok + '.tok'
74 - with open(fname_tok, 'r'
75   lines = f.readlines()
76   s = ''.join(lines)
77   res = tokenizer.encode(s)
78   # write to file
79 - with open(fname_out, 'w'
80   for x in res:
81 - f.write(str(x) + '
82   print('len(res): ', len(res))
83   print('len(lines): ', len(lines))
84   print('results written to: ', fname_out)

41   " Hello",
42   " Hello",
43   " Hello\n Hello",
44   ]
45
46   for text in tests:

69   if fname_tok:
70   print('tokenizing file: ', fname_tok)
71   fname_out = fname_tok + '.tok'
72 + with open(fname_tok, 'r') as f:
73   lines = f.readlines()
74   s = ''.join(lines)
75   res = tokenizer.encode(s)
76   # write to file
77 + with open(fname_out, 'w') as f:
78   for x in res:
79 + f.write(str(x) + ' ')
80 + f.write('\n')
81   print('len(res): ', len(res))
82   print('len(lines): ', len(lines))
83   print('results written to: ', fname_out)
tests/test-tokenizer-0-llama.cpp
CHANGED
@@ -174,8 +174,10 @@ int main(int argc, char **argv) {
174   }
175
176   for (const auto & tok : res) {
177 - ofs << tok << "
178   }
179   }
180
181   fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

174   }
175
176   for (const auto & tok : res) {
177 + ofs << tok << " ";
178   }
179 +
180 + ofs << "\n";
181   }
182
183   fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
tests/test-tokenizer-0-llama.py
CHANGED
@@ -81,14 +81,15 @@ fname_tok = args.fname_tok
81   if fname_tok:
82   print('tokenizing file: ', fname_tok)
83   fname_out = fname_tok + '.tok'
84 - with open(fname_tok, 'r'
85   lines = f.readlines()
86   s = ''.join(lines)
87   res = tokenizer.encode(s, add_bos=True)
88   # write to file
89 - with open(fname_out, 'w'
90   for x in res:
91 - f.write(str(x) + '
92   print('len(res): ', len(res))
93   print('len(lines): ', len(lines))
94   print('results written to: ', fname_out)

81   if fname_tok:
82   print('tokenizing file: ', fname_tok)
83   fname_out = fname_tok + '.tok'
84 + with open(fname_tok, 'r') as f:
85   lines = f.readlines()
86   s = ''.join(lines)
87   res = tokenizer.encode(s, add_bos=True)
88   # write to file
89 + with open(fname_out, 'w') as f:
90   for x in res:
91 + f.write(str(x) + ' ')
92 + f.write('\n')
93   print('len(res): ', len(res))
94   print('len(lines): ', len(lines))
95   print('results written to: ', fname_out)