Spaces:

Illumotion
/

Koboldcpp

Build error

App Files Files Community

Illumotion commited on Jun 26, 2023

Commit

69fb50e

•

1 Parent(s): 26428a5

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

docs/token_generation_performance_tips.md +40 -0
examples/CMakeLists.txt +48 -0
examples/Miku.sh +49 -0
examples/alpaca.sh +19 -0
examples/baby-llama/CMakeLists.txt +4 -0
examples/baby-llama/baby-llama.cpp +1696 -0
examples/benchmark/CMakeLists.txt +7 -0
examples/benchmark/benchmark-matmult.cpp +261 -0
examples/chat-13B.bat +57 -0
examples/chat-13B.sh +41 -0
examples/chat-persistent.sh +151 -0
examples/chat-vicuna.sh +41 -0
examples/chat.sh +16 -0
examples/common.cpp +955 -0
examples/common.h +138 -0
examples/embedding/CMakeLists.txt +7 -0
examples/embedding/README.md +3 -0
examples/embedding/embedding.cpp +97 -0
examples/gpt4all.sh +15 -0
examples/jeopardy/README.md +21 -0
examples/jeopardy/graph.py +57 -0
examples/jeopardy/jeopardy.sh +30 -0
examples/jeopardy/qasheet.csv +103 -0
examples/jeopardy/questions.txt +100 -0
examples/main/CMakeLists.txt +7 -0
examples/main/README.md +293 -0
examples/main/main.cpp +675 -0
examples/metal/CMakeLists.txt +3 -0
examples/metal/metal.cpp +104 -0
examples/perplexity/CMakeLists.txt +7 -0
examples/perplexity/README.md +3 -0
examples/perplexity/perplexity.cpp +176 -0
examples/quantize-stats/CMakeLists.txt +4 -0
examples/quantize-stats/quantize-stats.cpp +438 -0
examples/quantize/CMakeLists.txt +7 -0
examples/quantize/README.md +3 -0
examples/quantize/quantize.cpp +257 -0
examples/reason-act.sh +17 -0
examples/save-load-state/CMakeLists.txt +7 -0
examples/save-load-state/save-load-state.cpp +170 -0
examples/server/CMakeLists.txt +12 -0
examples/server/README.md +192 -0
examples/server/chat.mjs +89 -0
examples/server/chat.sh +77 -0
examples/server/httplib.h +0 -0
examples/server/json.hpp +0 -0
examples/server/server.cpp +975 -0
examples/simple/CMakeLists.txt +7 -0
examples/simple/simple.cpp +179 -0
examples/train-text-from-scratch/CMakeLists.txt +4 -0

docs/token_generation_performance_tips.md ADDED Viewed

	@@ -0,0 +1,40 @@

+# Token generation performance troubleshooting
+## Verifying that the model is running on the GPU with cuBLAS
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+```shell
+./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+```
+When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
+```shell
+llama_model_load_internal: [cublas] offloading 60 layers to GPU
+llama_model_load_internal: [cublas] offloading output layer to GPU
+llama_model_load_internal: [cublas] total VRAM used: 17223 MB
+... rest of inference
+```
+If you see these lines, then the GPU is being used.
+## Verifying that the CPU is not oversaturated
+llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
+# Example of runtime flags effect on inference speed benchmark
+These runs were tested on the following machine:
+GPU: A6000 (48GB VRAM)
+CPU: 7 physical cores
+RAM: 32GB
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Result:
+| command | tokens/second (higher is better) |
+| - | - |
+| -ngl 2000000 | N/A (less than 0.1) |
+| -t 7 | 1.7 |
+| -t 1 -ngl 2000000 | 5.5 |
+| -t 7 -ngl 2000000 | 8.7 |
+| -t 4 -ngl 2000000 | 9.1 |

examples/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,48 @@

+# dependencies
+find_package(Threads REQUIRED)
+# third-party
+# ...
+# common
+set(TARGET common)
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    )
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
+# examples
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+if (EMSCRIPTEN)
+else()
+    add_subdirectory(main)
+    add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
+    add_subdirectory(perplexity)
+    add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
+    add_subdirectory(benchmark)
+    add_subdirectory(baby-llama)
+    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(simple)
+    if (LLAMA_METAL)
+        add_subdirectory(metal)
+    endif()
+    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
+endif()

examples/Miku.sh ADDED Viewed

	@@ -0,0 +1,49 @@

+#!/bin/bash
+set -e
+AI_NAME="${AI_NAME:-Miku}"
+MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+USER_NAME="${USER_NAME:-Anon}"
+# Uncomment and adjust to the number of CPU cores you want to use.
+#N_THREAD="${N_THREAD:-4}"
+N_PREDICTS="${N_PREDICTS:-4096}"
+GEN_OPTIONS=(--batch_size 1024
+--ctx_size 2048
+--keep -1
+--repeat_last_n 256
+--repeat_penalty 1.17647
+--temp 0.7
+--top_k 40
+--top_p 0.5)
+if [ -n "$N_THREAD" ]; then
+    GEN_OPTIONS+=(--threads "$N_THREAD")
+fi
+./main "${GEN_OPTIONS[@]}" \
+    --model "$MODEL" \
+    --n_predict "$N_PREDICTS" \
+    --color --interactive \
+    --reverse-prompt "${USER_NAME}:" \
+    --prompt "
+This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
+${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
+${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
+${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
+The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
+${AI_NAME} can only communicate through text, so she can't send images or videos.
+${USER_NAME}: Hello!
+${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
+${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
+${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
+${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
+${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
+${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
+${AI_NAME}: What do you like to do in your free time? ^_^
+${USER_NAME}:" "$@"

examples/alpaca.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+cd `dirname $0`
+cd ..
+./main -m ./models/ggml-alpaca-7b-q4.bin \
+       --color \
+       -f ./prompts/alpaca.txt \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 7

examples/baby-llama/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+set(TARGET baby-llama)
+add_executable(${TARGET} baby-llama.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/baby-llama/baby-llama.cpp ADDED Viewed

	@@ -0,0 +1,1696 @@

+#include "ggml.h"
+#include <vector>
+#include <cassert>
+#include <random>
+#include <cstring>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+float frand() {
+    return (float)rand()/(float)RAND_MAX;
+}
+struct random_normal_distribution {
+    std::mt19937 gen;
+    std::normal_distribution<float> nd;
+    float min;
+    float max;
+};
+void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+    rnd->gen = std::mt19937(seed);
+    rnd->nd = std::normal_distribution<float>{mean, std};
+    rnd->min = min;
+    rnd->max = max;
+}
+float frand_normal(struct random_normal_distribution * rnd) {
+    const float r = rnd->nd(rnd->gen);
+    return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
+}
+struct ggml_tensor * randomize_tensor(
+        struct ggml_tensor * tensor,
+        int ndims,
+        const int64_t ne[],
+        float fmin,
+        float fmax) {
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)tensor->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)tensor->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+    return tensor;
+}
+struct ggml_tensor * randomize_tensor_normal(
+        struct ggml_tensor * tensor,
+        int ndims,
+        const int64_t ne[],
+        struct random_normal_distribution * rnd) {
+    float scale = 1.0; // xavier
+    switch (ndims) {
+        case 1:
+            scale /= sqrtf(ne[0]);
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
+            }
+            break;
+        case 2:
+            scale /= sqrtf(ne[0]+ne[1]);
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
+                }
+            }
+            break;
+        case 3:
+            scale /= sqrtf(ne[0]+ne[1]);
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
+                    }
+                }
+            }
+            break;
+        case 4:
+            scale /= sqrtf(ne[0]+ne[1]);
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+    return tensor;
+}
+struct llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    bool operator!=(const llama_hparams & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams));
+    }
+};
+uint32_t get_n_ff(const struct llama_hparams* hparams) {
+    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
+    return n_ff;
+}
+struct llama_hparams_lora {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    uint32_t n_lora  = 64;
+    bool operator!=(const llama_hparams_lora & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
+    }
+};
+struct llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+    // normalization
+    struct ggml_tensor * ffn_norm;
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+struct llama_layer_lora {
+    // normalization
+    struct ggml_tensor * attention_norm;
+    // attention
+    struct ggml_tensor * wqa;
+    struct ggml_tensor * wqb;
+    struct ggml_tensor * wka;
+    struct ggml_tensor * wkb;
+    struct ggml_tensor * wva;
+    struct ggml_tensor * wvb;
+    struct ggml_tensor * woa;
+    struct ggml_tensor * wob;
+    // normalization
+    struct ggml_tensor * ffn_norm;
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+struct llama_kv_cache {
+    struct ggml_context * ctx = NULL;
+    struct ggml_tensor * k;
+    struct ggml_tensor * v;
+    // llama_ctx_buffer buf;
+    int n; // number of tokens currently in the cache
+};
+struct llama_model {
+    struct ggml_context * ctx = NULL;
+    llama_hparams hparams;
+    struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+    std::vector<llama_layer> layers;
+};
+struct llama_model_lora {
+    struct ggml_context * ctx = NULL;
+    llama_hparams_lora hparams;
+    struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * norm;
+    struct ggml_tensor * outputa;
+    struct ggml_tensor * outputb;
+    std::vector<llama_layer_lora> layers;
+};
+void init_model(struct llama_model * model) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+    const uint32_t n_ff = get_n_ff(&hparams);
+    struct ggml_context * ctx = model->ctx;
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
+    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight",         {n_embd, n_vocab});
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        // std::string layers_i = "layers." + std::to_string(i);
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);     // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+    }
+}
+void init_model_lora(struct llama_model_lora * model) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_mult  = hparams.n_mult;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+    const uint32_t n_lora  = hparams.n_lora;
+    const uint32_t n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
+    struct ggml_context * ctx = model->ctx;
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);          // ("norm.weight",           {n_embd});
+    model->outputa        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_vocab); // ("output.weight",         {n_embd, n_vocab});
+    model->outputb        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,  n_lora); // ("output.weight",         {n_embd, n_vocab});
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        // std::string layers_i = "layers." + std::to_string(i);
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
+        layer.wqa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+        layer.wqb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
+        layer.wka = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+        layer.wkb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
+        layer.wva = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+        layer.wvb = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
+        layer.woa = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_lora, n_embd);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+        layer.wob = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_lora);    // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);       // (layers_i + ".ffn_norm.weight", {n_embd});
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);     // (layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);     // (layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+    }
+}
+void set_param_model(struct llama_model * model) {
+    const auto& hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct ggml_context* ctx = model->ctx;
+    ggml_set_param(ctx, model->tok_embeddings);
+    ggml_set_param(ctx, model->norm);
+    ggml_set_param(ctx, model->output);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        ggml_set_param(ctx, layer.attention_norm);
+        ggml_set_param(ctx, layer.wq);
+        ggml_set_param(ctx, layer.wk);
+        ggml_set_param(ctx, layer.wv);
+        ggml_set_param(ctx, layer.wo);
+        ggml_set_param(ctx, layer.ffn_norm);
+        ggml_set_param(ctx, layer.w1);
+        ggml_set_param(ctx, layer.w2);
+        ggml_set_param(ctx, layer.w3);
+    }
+}
+void set_param_model_lora(struct llama_model_lora * model) {
+    const auto& hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct ggml_context* ctx = model->ctx;
+    ggml_set_param(ctx, model->tok_embeddings);
+    ggml_set_param(ctx, model->norm);
+    ggml_set_param(ctx, model->outputa);
+    ggml_set_param(ctx, model->outputb);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        ggml_set_param(ctx, layer.attention_norm);
+        ggml_set_param(ctx, layer.wqa);
+        ggml_set_param(ctx, layer.wqb);
+        ggml_set_param(ctx, layer.wka);
+        ggml_set_param(ctx, layer.wkb);
+        ggml_set_param(ctx, layer.wva);
+        ggml_set_param(ctx, layer.wvb);
+        ggml_set_param(ctx, layer.woa);
+        ggml_set_param(ctx, layer.wob);
+        ggml_set_param(ctx, layer.ffn_norm);
+        ggml_set_param(ctx, layer.w1);
+        ggml_set_param(ctx, layer.w2);
+        ggml_set_param(ctx, layer.w3);
+    }
+}
+void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct random_normal_distribution rnd;
+    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
+    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
+    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
+    randomize_tensor_normal(model->output,         model->output->n_dims,         model->output->ne,         &rnd);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+        randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
+        randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
+        randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
+        randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
+        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
+        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+    }
+}
+void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_layer = hparams.n_layer;
+    struct random_normal_distribution rnd;
+    init_random_normal_distribution(&rnd, seed, mean, std, min, max);
+    randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
+    randomize_tensor_normal(model->norm,           model->norm->n_dims,           model->norm->ne,           &rnd);
+    randomize_tensor_normal(model->outputa,        model->outputa->n_dims,        model->outputa->ne,         &rnd);
+    randomize_tensor_normal(model->outputb,        model->outputb->n_dims,        model->outputb->ne,         &rnd);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+        randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+        randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
+        randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
+        randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
+        randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
+        randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
+        randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
+        randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
+        randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+        randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+        randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
+        randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
+        randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+    }
+}
+bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_ctx   = hparams.n_ctx;
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const int64_t n_mem      = n_layer*n_ctx*n_batch;
+    const int64_t n_elements = n_embd*n_mem;
+    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    // struct ggml_init_params params;
+    // params.mem_size   = cache.buf.size;
+    // params.mem_buffer = cache.buf.addr;
+    // params.no_alloc   = false;
+    if (!cache->ctx) {
+        struct ggml_init_params params;
+        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
+        params.mem_buffer = NULL;
+        params.no_alloc   = false;
+        cache->ctx = ggml_init(params);
+        if (!cache->ctx) {
+            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+            return false;
+        }
+    }
+    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+    return true;
+}
+bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+    const auto & hparams = model->hparams;
+    const uint32_t n_ctx   = hparams.n_ctx;
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const int64_t n_mem      = n_layer*n_ctx*n_batch;
+    const int64_t n_elements = n_embd*n_mem;
+    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    // struct ggml_init_params params;
+    // params.mem_size   = cache.buf.size;
+    // params.mem_buffer = cache.buf.addr;
+    // params.no_alloc   = false;
+    if (!cache->ctx) {
+        struct ggml_init_params params;
+        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
+        params.mem_buffer = NULL;
+        params.no_alloc   = false;
+        cache->ctx = ggml_init(params);
+        if (!cache->ctx) {
+            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+            return false;
+        }
+    }
+    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
+    return true;
+}
+struct ggml_tensor * forward(
+        struct llama_model    * model,
+        struct llama_kv_cache * cache,
+        struct ggml_context   * ctx0,
+        struct ggml_cgraph    * gf,
+        struct ggml_tensor    * tokens_input,
+        const  int              n_tokens,
+        const  int              n_past) {
+    const int N = n_tokens;
+    struct llama_kv_cache& kv_self = *cache;
+    const auto & hparams = model->hparams;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_head  = hparams.n_head;
+    const int n_rot   = hparams.n_rot;
+    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
+    struct ggml_tensor * kc = kv_self.k;
+    struct ggml_tensor * vc = kv_self.v;
+    // inpL shape [n_embd,N,1,1]
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * inpSA = inpL;
+        struct ggml_tensor * cur;
+        // lctx.use_buf(ctx0, 0);
+        // norm
+        {
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_rms_norm(ctx0, inpL);
+            // cur = attention_norm*cur
+            cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+                        cur);
+        }
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            // wq   shape [n_embd, n_embd, 1, 1]
+            // wk   shape [n_embd, n_embd, 1, 1]
+            // Qcur shape [n_embd/n_head, n_head, N, 1]
+            // Kcur shape [n_embd/n_head, n_head, N, 1]
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+                // wv   shape [n_embd, n_embd, 1, 1]
+                // Vcur shape [n_embd, N, 1, 1]
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
+                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
+                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
+                /* {
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                            (   n_ctx)*ggml_element_size(kv_self.v),
+                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    // important: storing RoPE-ed version of K in the KV cache!
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                } //*/
+                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            }
+            // Qcur shape [n_embd/n_head, n_head, N, 1]
+            // Q shape    [n_embd/n_head, N, n_head, 1]
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        Qcur,
+                        0, 2, 1, 3);
+            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+            // K shape [n_embd/n_head, n_past + N, n_head, 1]
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        0, 2, 1, 3);
+            // K * Q
+            // KQ shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            // KQ_masked = mask_past(KQ_scaled)
+            // KQ_masked shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            // KQ = soft_max(KQ_masked)
+            // KQ_soft_max shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            // split cached V into n_head heads
+            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
+            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, vc,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(vc),
+                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(vc)*n_embd);
+            // KQV shape [n_embd/n_head, N, n_head, 1]
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            // KQV_merged shape
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
+            // cur = ggml_cpy(ctx0,
+            //         KQV_merged,
+            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // projection (no bias)
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].wo,
+                    cur);
+        }
+        // lctx.use_buf(ctx0, 1);
+        // inpFF shape [n_embd,N,1,1]
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        // feed-forward network
+        {
+            // norm
+            {
+                // cur shape [n_embd,N,1,1]
+                cur = ggml_rms_norm(ctx0, inpFF);
+                // cur = ffn_norm*cur
+                // cur shape [n_embd,N,1,1]
+                cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+                        cur);
+            }
+            // tmp shape [n_ff,N,1,1]
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model->layers[il].w3,
+                    cur);
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w1,
+                    cur);
+            // SILU activation
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_silu(ctx0, cur);
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_mul(ctx0, cur, tmp);
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w2,
+                    cur);
+        }
+        // cur shape [n_embd,N,1,1]
+        cur = ggml_add(ctx0, cur, inpFF);
+        // input for next layer
+        // inpL shape [n_embd,N,1,1]
+        inpL = cur;
+    }
+    // norm
+    {
+        // inpL shape [n_embd,N,1,1]
+        inpL = ggml_rms_norm(ctx0, inpL);
+        // inpL = norm*inpL
+        // inpL shape [n_embd,N,1,1]
+        inpL = ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model->norm, inpL),
+                    inpL);
+        //embeddings = inpL;
+    }
+    // lm_head
+    // inpL shape [n_vocab,N,1,1]
+    inpL = ggml_mul_mat(ctx0, model->output, inpL);
+    // run the computation
+    ggml_build_forward_expand(gf, inpL);
+    return inpL;
+}
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+    GGML_ASSERT(tensor->n_dims == 1);
+    GGML_ASSERT(tensor->ne[0] == ne0);
+}
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+    GGML_ASSERT(tensor->n_dims == 2);
+    GGML_ASSERT(tensor->ne[0] == ne0);
+    GGML_ASSERT(tensor->ne[1] == ne1);
+}
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+    GGML_ASSERT(tensor->n_dims == 3);
+    GGML_ASSERT(tensor->ne[0] == ne0);
+    GGML_ASSERT(tensor->ne[1] == ne1);
+    GGML_ASSERT(tensor->ne[2] == ne2);
+}
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+    GGML_ASSERT(tensor->n_dims == 4);
+    GGML_ASSERT(tensor->ne[0] == ne0);
+    GGML_ASSERT(tensor->ne[1] == ne1);
+    GGML_ASSERT(tensor->ne[2] == ne2);
+    GGML_ASSERT(tensor->ne[3] == ne3);
+}
+struct ggml_tensor * forward_batch(
+        struct llama_model    * model,
+        struct llama_kv_cache * cache,
+        struct ggml_context   * ctx0,
+        struct ggml_cgraph    * gf,
+        struct ggml_tensor    * tokens_input,
+        const  int              n_tokens,
+        const  int              n_past,
+        const  int              n_batch) {
+    const int N = n_tokens;
+    struct llama_kv_cache& kv_self = *cache;
+    const auto & hparams = model->hparams;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_vocab = hparams.n_vocab;
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_head  = hparams.n_head;
+    const int n_rot   = hparams.n_rot;
+    const int n_ff    = get_n_ff(&hparams);
+    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
+    memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
+    struct ggml_tensor * kc = kv_self.k;
+    struct ggml_tensor * vc = kv_self.v;
+    // inpL shape [n_embd,N*n_batch,1]
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+    assert_shape_2d(inpL, n_embd, N*n_batch);
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * inpSA = inpL;
+        struct ggml_tensor * cur;
+        // lctx.use_buf(ctx0, 0);
+        // norm
+        {
+            // cur shape [n_embd,N*n_batch,1,1]
+            cur = ggml_rms_norm(ctx0, inpL);
+            assert_shape_2d(cur, n_embd, N*n_batch);
+            // cur = attention_norm*cur
+            cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+                        cur);
+            assert_shape_2d(cur, n_embd, N*n_batch);
+        }
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            // wq   shape [n_embd, n_embd, 1, 1]
+            // wk   shape [n_embd, n_embd, 1, 1]
+            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
+            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
+            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+                // wv   shape [n_embd, n_embd, 1, 1]
+                // Vcur shape [N, n_embd, n_batch, 1]
+                struct ggml_tensor * Vcur = ggml_cont(ctx0,
+                    ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_mul_mat(ctx0,
+                                model->layers[il].wv,
+                                cur),
+                        n_embd, N, n_batch),
+                        1, 0, 2, 3));
+                assert_shape_3d(Vcur, N, n_embd, n_batch);
+                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+                // k         shape [n_embd * N, n_batch]   == kv_self.k[:,n_past:n_past+N,:,il]
+                // v         shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
+                /* {
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                            (   n_ctx)*ggml_element_size(kv_self.v),
+                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    // important: storing RoPE-ed version of K in the KV cache!
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                } //*/
+                kc = ggml_set_2d(ctx0, kc,
+                        ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch),
+                        ggml_element_size(kc)*n_embd*n_ctx,
+                        (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past));
+                vc = ggml_set_2d(ctx0, vc,
+                        ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch),
+                        ggml_element_size(vc)*n_ctx*n_embd,
+                        ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx));
+                assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer);
+                assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer);
+            }
+            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
+            // Q shape    [n_embd/n_head, N, n_head, n_batch]
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        Qcur,
+                        0, 2, 1, 3);
+            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
+            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_4d(ctx0,
+                            ggml_view_3d(ctx0,
+                                kc,
+                                n_embd,
+                                (n_past + N),
+                                n_batch,
+                                n_embd*ggml_element_size(kc),
+                                n_ctx*n_embd*ggml_element_size(kc),
+                                il*n_batch*n_ctx*n_embd*ggml_element_size(kc)),
+                            n_embd/n_head, n_head, n_past + N, n_batch),
+                        0, 2, 1, 3);
+            assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch);
+            // K * Q
+            // KQ shape [n_past + N, N, n_head, n_batch]
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            assert_shape_4d(KQ, n_past + N, N, n_head, n_batch);
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
+            // KQ_masked = mask_past(KQ_scaled)
+            // KQ_masked shape [n_past + N, N, n_head, n_batch]
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch);
+            // KQ = soft_max(KQ_masked)
+            // KQ_soft_max shape [n_past + N, N, n_head, n_batch]
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
+            // split cached V into n_head heads
+            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
+            struct ggml_tensor * V =
+                ggml_view_4d(ctx0, vc,
+                        n_past + N, n_embd/n_head, n_head, n_batch,
+                        ggml_element_size(vc)*n_ctx,
+                        ggml_element_size(vc)*n_ctx*n_embd/n_head,
+                        ggml_element_size(vc)*n_ctx*n_embd,
+                        il*n_batch*n_ctx*n_embd*ggml_element_size(vc));
+            assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch);
+            // KQV shape [n_embd/n_head, N, n_head, n_batch]
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch);
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            // KQV_merged shape [n_embd/n_head, n_head, N, n_batch]
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch);
+            // KQV_merged shape
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            // cur shape [n_embd,N*n_batch,1,1]
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch);
+            assert_shape_2d(cur, n_embd, N*n_batch);
+            // cur = ggml_cpy(ctx0,
+            //         KQV_merged,
+            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // projection (no bias)
+            // cur shape [n_embd,N*n_batch,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].wo,
+                    cur);
+            assert_shape_2d(cur, n_embd, N*n_batch);
+        }
+        // lctx.use_buf(ctx0, 1);
+        // inpFF shape [n_embd,N*n_batch,1,1]
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        assert_shape_2d(inpFF, n_embd, N*n_batch);
+        // feed-forward network
+        {
+            // norm
+            {
+                // cur shape [n_embd,N*n_batch,1,1]
+                cur = ggml_rms_norm(ctx0, inpFF);
+                assert_shape_2d(cur, n_embd, N*n_batch);
+                // cur = ffn_norm*cur
+                // cur shape [n_embd,N*n_batch,1,1]
+                cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+                        cur);
+                assert_shape_2d(cur, n_embd, N*n_batch);
+            }
+            // tmp shape [n_ff,N*n_batch,1,1]
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model->layers[il].w3,
+                    cur);
+            assert_shape_2d(tmp, n_ff, N*n_batch);
+            // cur shape [n_ff,N*n_batch,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w1,
+                    cur);
+            assert_shape_2d(cur, n_ff, N*n_batch);
+            // SILU activation
+            // cur shape [n_ff,N*n_batch,1,1]
+            cur = ggml_silu(ctx0, cur);
+            assert_shape_2d(cur, n_ff, N*n_batch);
+            // cur shape [n_ff,N*n_batch,1,1]
+            cur = ggml_mul(ctx0, cur, tmp);
+            assert_shape_2d(cur, n_ff, N*n_batch);
+            // cur shape [n_embd,N*n_batch,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w2,
+                    cur);
+            assert_shape_2d(cur, n_embd, N*n_batch);
+        }
+        // cur shape [n_embd,N*n_batch,1,1]
+        cur = ggml_add(ctx0, cur, inpFF);
+        assert_shape_2d(cur, n_embd, N*n_batch);
+        // input for next layer
+        // inpL shape [n_embd,N*n_batch,1,1]
+        inpL = cur;
+        assert_shape_2d(inpL, n_embd, N*n_batch);
+    }
+    // norm
+    {
+        // inpL shape [n_embd,N*n_batch,1,1]
+        inpL = ggml_rms_norm(ctx0, inpL);
+        assert_shape_2d(inpL, n_embd, N*n_batch);
+        // inpL = norm*inpL
+        // inpL shape [n_embd,N*n_batch,1,1]
+        inpL = ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model->norm, inpL),
+                    inpL);
+        assert_shape_2d(inpL, n_embd, N*n_batch);
+        //embeddings = inpL;
+    }
+    // lm_head
+    // inpL shape [n_vocab,N*n_batch,1,1]
+    inpL = ggml_mul_mat(ctx0, model->output, inpL);
+    assert_shape_2d(inpL, n_vocab, N*n_batch);
+    {
+        // inpL shape [n_vocab,N,n_batch,1]
+        inpL = ggml_reshape_3d(ctx0,
+                        inpL,
+                        n_vocab, N, n_batch);
+        assert_shape_3d(inpL, n_vocab, N, n_batch);
+    }
+    // run the computation
+    ggml_build_forward_expand(gf, inpL);
+    return inpL;
+}
+struct ggml_tensor * forward_lora(
+        struct llama_model_lora * model,
+        struct llama_kv_cache   * cache,
+        struct ggml_context     * ctx0,
+        struct ggml_cgraph      * gf,
+        struct ggml_tensor      * tokens_input,
+        const  int                n_tokens,
+        const  int                n_past) {
+    const int N = n_tokens;
+    struct llama_kv_cache& kv_self = *cache;
+    const auto & hparams = model->hparams;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_head  = hparams.n_head;
+    const int n_rot   = hparams.n_rot;
+    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
+    struct ggml_tensor * kc = kv_self.k;
+    struct ggml_tensor * vc = kv_self.v;
+    // inpL shape [n_embd,N,1,1]
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * inpSA = inpL;
+        struct ggml_tensor * cur;
+        // norm
+        {
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_rms_norm(ctx0, inpL);
+            // cur = attention_norm*cur
+            cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+                        cur);
+        }
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            // wq   shape [n_embd, n_embd, 1, 1]
+            // wk   shape [n_embd, n_embd, 1, 1]
+            // Qcur shape [n_embd/n_head, n_head, N, 1]
+            // Kcur shape [n_embd/n_head, n_head, N, 1]
+            struct ggml_tensor * Qcur = ggml_rope(ctx0,
+                                            ggml_reshape_3d(ctx0,
+                                                ggml_mul_mat(ctx0,
+                                                    model->layers[il].wqa,
+                                                    ggml_mul_mat(ctx0,
+                                                        model->layers[il].wqb,
+                                                        cur)),
+                                                n_embd/n_head, n_head, N),
+                                            n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0,
+                                            ggml_reshape_3d(ctx0,
+                                                ggml_mul_mat(ctx0,
+                                                    model->layers[il].wka,
+                                                    ggml_mul_mat(ctx0,
+                                                        model->layers[il].wkb,
+                                                        cur)),
+                                                n_embd/n_head, n_head, N),
+                                            n_past, n_rot, 0);
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+                // wv   shape [n_embd, n_embd, 1, 1]
+                // Vcur shape [n_embd, N, 1, 1]
+                struct ggml_tensor * Vcur = ggml_cont(ctx0,
+                                                ggml_transpose(ctx0,
+                                                    ggml_reshape_2d(ctx0,
+                                                        ggml_mul_mat(ctx0,
+                                                            model->layers[il].wva,
+                                                            ggml_mul_mat(ctx0,
+                                                                model->layers[il].wvb,
+                                                                cur)),
+                                                        n_embd, N)));
+                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
+                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
+                /* {
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
+                            (   n_ctx)*ggml_element_size(kv_self.v),
+                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                    // important: storing RoPE-ed version of K in the KV cache!
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                } //*/
+                kc = ggml_set_1d(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                vc = ggml_set_2d(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            }
+            // Qcur shape [n_embd/n_head, n_head, N, 1]
+            // Q shape    [n_embd/n_head, N, n_head, 1]
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        Qcur,
+                        0, 2, 1, 3);
+            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+            // K shape [n_embd/n_head, n_past + N, n_head, 1]
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        0, 2, 1, 3);
+            // K * Q
+            // KQ shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            // KQ_masked = mask_past(KQ_scaled)
+            // KQ_masked shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            // KQ = soft_max(KQ_masked)
+            // KQ_soft_max shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            // split cached V into n_head heads
+            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
+            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, vc,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(vc),
+                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(vc)*n_embd);
+            // KQV shape [n_embd/n_head, N, n_head, 1]
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            // KQV_merged shape
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
+            // cur = ggml_cpy(ctx0,
+            //         KQV_merged,
+            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            // projection (no bias)
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].woa,
+                    ggml_mul_mat(ctx0,
+                        model->layers[il].wob,
+                        cur));
+        }
+        // inpFF shape [n_embd,N,1,1]
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        // feed-forward network
+        {
+            // norm
+            {
+                // cur shape [n_embd,N,1,1]
+                cur = ggml_rms_norm(ctx0, inpFF);
+                // cur = ffn_norm*cur
+                // cur shape [n_embd,N,1,1]
+                cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+                        cur);
+            }
+            // tmp shape [n_ff,N,1,1]
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model->layers[il].w3,
+                    cur);
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w1,
+                    cur);
+            // SILU activation
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_silu(ctx0, cur);
+            // cur shape [n_ff,N,1,1]
+            cur = ggml_mul(ctx0, cur, tmp);
+            // cur shape [n_embd,N,1,1]
+            cur = ggml_mul_mat(ctx0,
+                    model->layers[il].w2,
+                    cur);
+        }
+        // cur shape [n_embd,N,1,1]
+        cur = ggml_add(ctx0, cur, inpFF);
+        // input for next layer
+        // inpL shape [n_embd,N,1,1]
+        inpL = cur;
+    }
+    // norm
+    {
+        // inpL shape [n_embd,N,1,1]
+        inpL = ggml_rms_norm(ctx0, inpL);
+        // inpL = norm*inpL
+        // inpL shape [n_embd,N,1,1]
+        inpL = ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model->norm, inpL),
+                    inpL);
+        //embeddings = inpL;
+    }
+    // lm_head
+    // inpL shape [n_vocab,N,1,1]
+    inpL = ggml_mul_mat(ctx0,
+                model->outputa,
+                    ggml_mul_mat(ctx0,
+                        model->outputb,
+                        inpL));
+    // ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+    // run the computation
+    ggml_build_forward_expand(gf, inpL);
+    return inpL;
+}
+void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+    assert(logits->n_dims == 2);
+    assert(probs->n_dims == 2);
+    assert(best_samples->n_dims == 1);
+    assert(logits->ne[1] == best_samples->ne[0]);
+    assert(logits->ne[0] == probs->ne[0]);
+    assert(logits->ne[1] == probs->ne[1]);
+    for (int i = 0; i < logits->ne[1]; ++i) {
+        float max_logit = ggml_get_f32_1d(logits, i * logits->ne[0]);
+        ggml_set_i32_1d(best_samples, i, 0);
+        for (int k = 0; k < logits->ne[0]; ++k) {
+            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
+            if (logit > max_logit) {
+                max_logit = logit;
+                ggml_set_i32_1d(best_samples, i, k);
+            }
+        }
+        float psum = 0;
+        for (int k = 0; k < logits->ne[0]; ++k) {
+            float logit = ggml_get_f32_1d(logits, i * logits->ne[0] + k);
+            float p = (logit == -INFINITY) ? 0 : expf(logit - max_logit);
+            psum += p;
+            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p);
+        }
+        for (int k = 0; k < logits->ne[0]; ++k) {
+            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+            ggml_set_f32_1d(probs, i * probs->ne[0] + k, p / psum);
+        }
+    }
+}
+void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(probs->n_dims == 3);
+    int n_tokens = best_samples->ne[0];
+    int n_batch  = best_samples->ne[1];
+    int n_vocab  = logits->ne[0];
+    GGML_ASSERT(n_tokens == logits->ne[1]);
+    GGML_ASSERT(n_batch  == logits->ne[2]);
+    GGML_ASSERT(n_vocab  == probs->ne[0]);
+    GGML_ASSERT(n_tokens == probs->ne[1]);
+    GGML_ASSERT(n_batch  == probs->ne[2]);
+    for (int k = 0; k < n_batch; ++k) {
+        struct ggml_tensor * best_samples_k = ggml_view_1d(ctx,
+                                                best_samples,
+                                                best_samples->ne[0],
+                                                k*best_samples->nb[1]);
+        struct ggml_tensor * logits_k       = ggml_view_2d(ctx,
+                                                logits,
+                                                logits->ne[0],
+                                                logits->ne[1],
+                                                logits->nb[1],
+                                                k*logits->nb[2]);
+        struct ggml_tensor * probs_k        = ggml_view_2d(ctx,
+                                                probs,
+                                                probs->ne[0],
+                                                probs->ne[1],
+                                                probs->nb[1],
+                                                k*probs->nb[2]);
+        sample_softmax(logits_k, probs_k, best_samples_k);
+    }
+}
+void print_row(struct ggml_tensor * probs, int i) {
+    for (int k = 0; k < probs->ne[0]; ++k) {
+        float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+        printf(" %.2f", p);
+    }
+    printf("\n");
+}
+void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+void print_token(int token, int n_vocab) {
+    for (int k = 0; k < token; ++k) {
+        printf(" ");
+    }
+    printf("X");
+    for (int k = token+1; k < n_vocab; ++k) {
+        printf(" ");
+    }
+    printf("\n");
+}
+void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+    for (int i=0; i<tokens->ne[0]; ++i) {
+        int token = ggml_get_i32_1d(tokens, i);
+        print_token(token, n_vocab);
+    }
+}
+void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+    int n_tokens = tokens_input->ne[0];
+    int n_vocab = targets->ne[0];
+    float randomness = 0.0f;
+    // ggml_set_zero(targets);
+    ggml_set_f32(targets, -1.0f);
+    ggml_set_i32_1d(tokens_input, 0, 0);
+    for (int i=1; i<n_tokens+1; ++i) {
+        float x = example_id + i * 3.14159f * 2.0f * 1.0f * 0.5f / n_tokens;
+        float y = sinf(x);//*cosf(x*1.1f+1.0f);
+        float z = (y+1.0f)*0.5f; // scale to [0..1]
+        z += (frand()-0.5f)*(randomness/n_vocab);
+        z = (z < 0.0f) ? 0.0f : (z > 1.0f) ? 1.0f : z; // clamp to [0..1]
+        int token = std::max(1,std::min(1+(int)(z*(float)(n_vocab-1)), n_vocab-1));
+        ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
+        if (i<n_tokens) {
+            ggml_set_i32_1d(tokens_input, i, token);
+        }
+    }
+}
+void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(     targets->n_dims == 3);
+    int n_tokens = tokens_input->ne[0];
+    int n_batch  = tokens_input->ne[1];
+    GGML_ASSERT(n_tokens == targets->ne[1]);
+    GGML_ASSERT(n_batch  == targets->ne[2]);
+    for (int k=0; k<n_batch; ++k) {
+        struct ggml_tensor * tokens_input_k = ggml_view_1d(ctx,
+                                                tokens_input,
+                                                tokens_input->ne[0],
+                                                k*tokens_input->nb[1]);
+        struct ggml_tensor * targets_k    = ggml_view_2d(ctx,
+                                                targets,
+                                                targets->ne[0],
+                                                targets->ne[1],
+                                                targets->nb[1],
+                                                k*targets->nb[2]);
+        get_example_targets(example_id*n_batch + k, tokens_input_k, targets_k);
+    }
+}
+void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+    int n_tokens = tokens_input->ne[0];
+    int n_vocab = targets->ne[0];
+    for (int i=0; i<n_tokens-n_shift; ++i) {
+        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
+        for (int k=0; k<n_vocab; ++k) {
+            ggml_set_f32_1d(targets, i*n_vocab + k, ggml_get_f32_1d(targets, (i + n_shift)*n_vocab + k));
+        }
+    }
+}
+struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+    // todo: instead of a-b: a[1:]-b[:-1]
+    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
+}
+struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+    const float eps = 1e-3f;
+    return
+        ggml_sum(ctx,
+            ggml_neg(ctx,
+                ggml_sum_rows(ctx,
+                    ggml_mul(ctx,
+                        ggml_soft_max(ctx, a),
+                        ggml_log(ctx,
+                            ggml_add1(ctx,
+                                ggml_soft_max(ctx, b),
+                                ggml_new_f32(ctx, eps)))))));
+}
+int main(int argc, char ** argv) {
+    if (argc < 1) {
+        fprintf(stderr, "usage: %s\n", argv[0]);
+        return 1;
+    }
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll;
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+    struct llama_model model;
+    model.hparams.n_vocab = 8;
+    model.hparams.n_ctx   = 8;
+    model.hparams.n_embd  = 32;
+    model.hparams.n_mult  = 2;
+    model.hparams.n_head  = 8;
+    model.hparams.n_layer = 1;
+    model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
+    // model.hparams.n_embd  = 32;
+    // model.hparams.n_mult  = 2;
+    // model.hparams.n_head  = 4;
+    // model.hparams.n_layer = 8;
+    // model.hparams.n_rot   = 8;
+    model.ctx = ggml_init(lcparams);
+    printf("init model\n");
+    init_model(&model);
+    set_param_model(&model);
+    randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
+/*
+    struct llama_model_lora model_lora;
+    // model.hparams.n_vocab = 6;
+    // model.hparams.n_ctx   = 64;
+    // model.hparams.n_embd  = 128;
+    // model.hparams.n_mult  = 2;
+    // model.hparams.n_head  = 8;
+    // model.hparams.n_layer = 6;
+    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
+    model_lora.hparams.n_vocab = 16;
+    model_lora.hparams.n_ctx   = 32;
+    model_lora.hparams.n_embd  = 256;
+    model_lora.hparams.n_mult  = 2;
+    model_lora.hparams.n_head  = 16;
+    model_lora.hparams.n_layer = 1;
+    model_lora.hparams.n_lora  = 64;
+    model_lora.hparams.n_rot   = MIN(16, model_lora.hparams.n_embd / model_lora.hparams.n_head);
+    // model.hparams.n_rot   = (model.hparams.n_embd / model.hparams.n_head) / 2;
+    // model.hparams.n_embd  = 32;
+    // model.hparams.n_mult  = 2;
+    // model.hparams.n_head  = 4;
+    // model.hparams.n_layer = 8;
+    // model.hparams.n_rot   = 8;
+    model_lora.ctx = ggml_init(lcparams);
+    printf("init model_lora\n");
+    init_model_lora(&model_lora);
+    set_param_model_lora(&model_lora);
+    randomize_model_lora(&model_lora, 1337, 0.0f, 1.0f, -1.0f, +1.0f);
+*/
+    int n_batch = 8;
+    // key + value cache for the self attention
+    struct llama_kv_cache kv_self;
+    printf("init_kv_cache\n");
+    kv_self.ctx = model.ctx;
+    init_kv_cache(&kv_self, &model, n_batch);
+    //init_kv_cache_lora(&kv_self, &model_lora);
+    size_t    compute_size = 1024ll*1024ll*1024ll;
+    uint8_t * compute_addr = new uint8_t[compute_size];
+    int n_examples = 256;
+    int n_tokens = model.hparams.n_ctx;
+    int n_vocab  = model.hparams.n_vocab;
+    for (int ex=0; ex<n_examples; ++ex) {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ compute_size,
+            /*.mem_buffer =*/ compute_addr,
+            /*.no_alloc   =*/ false,
+        };
+        struct ggml_context * ctx0 = ggml_init(params);
+        struct ggml_tensor * after_opt_best_samples  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+        struct ggml_tensor * after_opt_probs         = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+        struct ggml_tensor * tokens_input            = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+        struct ggml_tensor * targets                 = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+        int n_past = 0;
+        ggml_cgraph gf = {};
+        gf.n_threads = 1;
+        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
+        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
+        ggml_build_forward_expand(&gf, e);
+        ggml_graph_compute(ctx0, &gf);
+        float error_before_opt = ggml_get_f32_1d(e, 0);
+        struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
+        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+        opt_params_adam.print_forward_graph = false;
+        opt_params_adam.print_backward_graph = false;
+        opt_params_lbfgs.print_forward_graph = false;
+        opt_params_lbfgs.print_backward_graph = false;
+        opt_params_adam.adam.n_iter = 16;
+        opt_params_lbfgs.lbfgs.n_iter = 16;
+        // ggml_opt(ctx0, opt_params_adam, e);
+        ggml_opt(ctx0, opt_params_lbfgs, e);
+        //
+        ggml_build_forward_expand(&gf, e);
+        ggml_graph_compute(ctx0, &gf);
+        float error_after_opt = ggml_get_f32_1d(e, 0);
+        if (ex % 8 == 0) {
+            printf("Example %d\n", (ex+1));
+            printf("error_before_opt: %.2f\n", error_before_opt);
+            printf("error_after_opt:  %.2f\n", error_after_opt);
+        }
+        if (ex % 64 == 0) {
+            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
+            // printf("probabilities after optimization:\n");
+            // print_matrix(after_opt_probs);
+            printf("best samples after optimization:\n");
+            print_tokens(after_opt_best_samples, n_vocab);
+        }
+        ggml_free(ctx0);
+    }
+    {
+        int n_gen = 128;
+        int sample_ctx = n_tokens-n_tokens/8;
+        printf("Generating %d tokens.\n", n_gen);
+        struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
+        get_example_targets(137, tokens_input, targets);
+        for (int i=sample_ctx; i<n_tokens; ++i) {
+            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
+        }
+        for (int i=0; i<sample_ctx-1; ++i) {
+            print_token(ggml_get_i32_1d(tokens_input, i), n_vocab);
+        }
+        printf("---\n");
+        for (int i=0; i<n_gen; ++i) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ compute_size,
+                /*.mem_buffer =*/ compute_addr,
+                /*.no_alloc   =*/ false,
+            };
+            struct ggml_context * ctx0 = ggml_init(params);
+            ggml_cgraph gf = {};
+            gf.n_threads = 1;
+            int n_past = 0;
+            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+            ggml_build_forward_expand(&gf, logits);
+            ggml_graph_compute(ctx0, &gf);
+            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
+            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
+            sample_softmax(logits, probs, best_samples);
+            // int sample_at = n_tokens-1;
+            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
+            // print_row(probs, sample_at);
+            print_token(token, n_vocab);
+            lshift_examples(tokens_input, targets, 1);
+            ggml_set_i32_1d(tokens_input, 0, 0);
+            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
+            ggml_free(ctx0);
+        }
+    }
+    print_matrix(model.tok_embeddings);
+    printf("done\n");
+    // ggml_free(kv_self.ctx);
+    // ggml_free(model_lora.ctx);
+    ggml_free(model.ctx);
+    return 0;
+}

examples/benchmark/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET benchmark)
+add_executable(${TARGET} benchmark-matmult.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/benchmark/benchmark-matmult.cpp ADDED Viewed

	@@ -0,0 +1,261 @@

+#include "ggml.h"
+#include "build-info.h"
+#include <locale.h>
+#include <assert.h>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+float tensor_sum_elements(const ggml_tensor * tensor) {
+    float sum = 0;
+    if (tensor->type==GGML_TYPE_F32) {
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+            }
+        }
+    }
+    return sum;
+}
+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
+struct benchmark_params_struct {
+    int32_t n_threads     = 1;
+    int32_t n_iterations  = 10;
+};
+void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
+    fprintf(stderr, "\n");
+}
+int main(int argc, char ** argv)  {
+    struct benchmark_params_struct benchmark_params;
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        }  else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, benchmark_params);
+            exit(0);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    printf("Starting Test\n");
+    // create the ggml context
+    struct ggml_context * ctx;
+    //const int sizex = 4096;
+    //const int sizey = 11008;
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);
+    const int sizez = 1;
+    /*const int sizey = 1;
+    const int sizex = 3*(8*32);
+    const int sizez = 1;*/
+#endif
+    //printf("Memsize required = %i\n", sizex*sizex);
+    size_t ctx_size = 0;
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += 1024*1024*16;
+    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /* no_alloc   =*/ 0
+    };
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        return 1;
+    }
+    printf("Creating new tensors\n");
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+    // printf("Creating new tensor m2\n");
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    // printf("Creating new tensor m11xm2\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+    gf.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+    ggml_graph_compute(ctx, &gf);
+    TENSOR_DUMP(gf.nodes[0]);
+    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    int32_t nelements = sizex*sizey;
+    int32_t ne[2] = { sizex, sizey };
+    std::vector<int64_t> hist_cur(1 << 4, 0);
+    // Set up a the benchmark matrices
+    // printf("Creating new tensor q11 & Running quantize\n");
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    // Set up a the compute graph
+    // printf("Creating new tensor q31\n");
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    gf31.n_threads=benchmark_params.n_threads;
+    // Set up a second graph computation to make sure we override the CPU cache lines
+    // printf("Creating new tensor q12 & Running quantize\n");
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    // printf("Creating new tensor q32\n");
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+    //printf("Creating compute graph\n");
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    gf32.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
+    double  gflops_sum = 0;
+    for (int i=0;i<benchmark_params.n_iterations ;i++) {
+        long long int start = ggml_time_us();
+        //printf("Running ggml_graph_compute\n");
+        ggml_graph_compute(ctx, &gf31);
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        gflops_sum += gflops;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
+            i,
+            gf31.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
+            usec,gflops);
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP("res",gf31.nodes[0])
+#endif
+        // Check that the matrix multiplication result is in the right ballpark
+        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
+        if (delta > allowed_delta)  {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
+                sum_of_F32_reference,
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
+            );
+            exit(0);
+        }
+        // Running a different graph computation to make sure we override the CPU cache lines
+        ggml_graph_compute(ctx, &gf32);
+    }
+    printf("\n");
+    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
+    printf("=====================================================================================\n");
+}

examples/chat-13B.bat ADDED Viewed

	@@ -0,0 +1,57 @@

+@setlocal disabledelayedexpansion enableextensions
+@echo off
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+rem Default main script paths
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+rem If the main script path was not specified, try the default paths
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+rem Run the script
+echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+  --model "%MODEL%" ^
+  --n_predict %N_PREDICTS% ^
+  --color --interactive ^
+  --reverse-prompt "%USER_NAME%:" ^
+  --prompt "%PROMPT_TEXT%"

examples/chat-13B.sh ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/bin/bash
+set -e
+cd "$(dirname "$0")/.." || exit
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="${USER_NAME:-USER}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     $PROMPT_TEMPLATE > $PROMPT_FILE
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "${USER_NAME}:" \
+  --in-prefix ' ' \
+  "$@"

examples/chat-persistent.sh ADDED Viewed

	@@ -0,0 +1,151 @@

+#!/bin/bash
+set -euo pipefail
+cd "$(dirname "$0")/.." || exit
+if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
+    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
+    exit 1
+fi
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
+USER_NAME="${USER_NAME:-User}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+DATE_TIME="$(date +%H:%M)"
+DATE_YEAR="$(date +%Y)"
+LOG="${CHAT_SAVE_DIR}/main.log"
+LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
+CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
+CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
+NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
+NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
+SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
+SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
+CTX_SIZE=2048
+CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
+OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
+# An unbuffered `tail -c+N`
+skip_bytes() {
+    LANG=C IFS= read -r -n "$1" -d '' c
+    while LANG=C IFS= read -r -n 1 -d '' c; do
+        printf '%s' "$c"
+    done
+}
+mkdir -p "$CHAT_SAVE_DIR"
+echo >"$LOG"
+trap "tail -n100 ${LOG}" EXIT
+if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
+    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
+        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
+        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
+        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
+        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
+fi
+if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
+    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+fi
+if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
+    echo '...' >>"$NEXT_PROMPT_FILE"
+fi
+if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
+    echo 'Prompt cache does not exist, building...'
+    # Default batch_size to 8 here for better user feedback during initial prompt processing
+    ./main 2>>"$LOG" \
+        --batch_size 8 \
+        "${OPTS[@]}" \
+        --prompt-cache "$PROMPT_CACHE_FILE" \
+        --file "$CUR_PROMPT_FILE" \
+        --n_predict 1
+    echo
+    echo 'Done!'
+fi
+if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
+fi
+if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+fi
+printf '%s ' "$(< "$CUR_PROMPT_FILE")"
+n_tokens=0
+while read -e line; do
+    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
+    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
+    # Swap prompts when we're about to run out of context
+    if ((n_predict <= 0)); then
+        wait # for background main (below) to finish with next prompt
+        mv "$NEXT_PROMPT_FILE"  "$CUR_PROMPT_FILE"
+        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
+        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+        echo '...' >>"$NEXT_PROMPT_FILE"
+        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+        n_tokens=0
+        n_predict=$((CTX_SIZE / 2))
+    fi
+    echo " ${line}" >>"$CUR_PROMPT_FILE"
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        echo " ${line}" >>"$NEXT_PROMPT_FILE"
+    fi
+    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
+    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
+    ./main 2>>"$LOG" "${OPTS[@]}" \
+            --prompt-cache "$CUR_PROMPT_CACHE" \
+            --prompt-cache-all \
+            --file "$CUR_PROMPT_FILE" \
+            --reverse-prompt "${USER_NAME}:" \
+            --n_predict "$n_predict" |
+        skip_bytes 1 |                  # skip BOS token added by ./main
+        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
+        skip_bytes "$n_prompt_len_pre"  # print generation
+    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
+    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
+    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
+        printf '\n%s:' "$USER_NAME"
+        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
+    fi
+    printf ' '
+    # HACK get num tokens from debug message
+    # TODO get both messages in one go
+    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
+        ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+        echo >&2 "Couldn't get number of tokens from ./main output!"
+        exit 1
+    fi
+    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
+    fi
+    # Update cache for next prompt in background, ideally during user input
+    ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+          --prompt-cache "$NEXT_PROMPT_CACHE" \
+          --file "$NEXT_PROMPT_FILE" \
+          --n_predict 1 &
+done

examples/chat-vicuna.sh ADDED Viewed

	@@ -0,0 +1,41 @@

+#!/bin/bash
+set -e
+cd "$(dirname "$0")/.." || exit
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     $PROMPT_TEMPLATE > $PROMPT_FILE
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"

examples/chat.sh ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+cd `dirname $0`
+cd ..
+# Important:
+#
+#   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
+#
+./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
+    --repeat_penalty 1.0 --color -i \
+    -r "User:" -f prompts/chat-with-bob.txt

examples/common.cpp ADDED Viewed

	@@ -0,0 +1,955 @@

+#include "common.h"
+#include <cassert>
+#include <iostream>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <sstream>
+#include <unordered_set>
+#include <regex>
+#if defined(__APPLE__) && defined(__MACH__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <wchar.h>
+#endif
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+int32_t get_num_physical_cores() {
+#ifdef __linux__
+    // enumerate the set of thread siblings, num entries is num cores
+    std::unordered_set<std::string> siblings;
+    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+        std::ifstream thread_siblings("/sys/devices/system/cpu"
+            + std::to_string(cpu) + "/topology/thread_siblings");
+        if (!thread_siblings.is_open()) {
+            break; // no more cpus
+        }
+        std::string line;
+        if (std::getline(thread_siblings, line)) {
+            siblings.insert(line);
+        }
+    }
+    if (siblings.size() > 0) {
+        return static_cast<int32_t>(siblings.size());
+    }
+#elif defined(__APPLE__) && defined(__MACH__)
+    int32_t num_physical_cores;
+    size_t len = sizeof(num_physical_cores);
+    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+#elif defined(_WIN32)
+    //TODO: Implement
+#endif
+    unsigned int n_threads = std::thread::hardware_concurrency();
+    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+}
+void process_escapes(std::string& input) {
+    std::size_t input_len = input.length();
+    std::size_t output_idx = 0;
+    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+            switch (input[++input_idx]) {
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
+            }
+        } else {
+            input[output_idx++] = input[input_idx];
+        }
+    }
+    input.resize(output_idx);
+}
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool invalid_param = false;
+    bool escape_prompt = false;
+    std::string arg;
+    gpt_params default_params;
+    const std::string arg_prefix = "--";
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-e") {
+            escape_prompt = true;
+        } else if (arg == "--prompt-cache") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_prompt_cache = argv[i];
+        } else if (arg == "--prompt-cache-all") {
+            params.prompt_cache_all = true;
+        } else if (arg == "--prompt-cache-ro") {
+            params.prompt_cache_ro = true;
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-n" || arg == "--n-predict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "--top-k") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_k = std::stoi(argv[i]);
+        } else if (arg == "-c" || arg == "--ctx-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--memory-f32") {
+            params.memory_f16 = false;
+        } else if (arg == "--top-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.top_p = std::stof(argv[i]);
+        } else if (arg == "--temp") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.temp = std::stof(argv[i]);
+        } else if (arg == "--tfs") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.tfs_z = std::stof(argv[i]);
+        } else if (arg == "--typical") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.typical_p = std::stof(argv[i]);
+        } else if (arg == "--repeat-last-n") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_last_n = std::stoi(argv[i]);
+        } else if (arg == "--repeat-penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "--frequency-penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.frequency_penalty = std::stof(argv[i]);
+        } else if (arg == "--presence-penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.presence_penalty = std::stof(argv[i]);
+        } else if (arg == "--mirostat") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat = std::stoi(argv[i]);
+        } else if (arg == "--mirostat-lr") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat_eta = std::stof(argv[i]);
+        } else if (arg == "--mirostat-ent") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat_tau = std::stof(argv[i]);
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "--keep") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else if (arg == "--interactive-first") {
+            params.interactive_first = true;
+        } else if (arg == "-ins" || arg == "--instruct") {
+            params.instruct = true;
+        } else if (arg == "--multiline-input") {
+            params.multiline_input = true;
+        } else if (arg == "--color") {
+            params.use_color = true;
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
+        } else if (arg == "--mtest") {
+            params.mem_test = true;
+        } else if (arg == "--export") {
+            params.export_cgraph = true;
+        } else if (arg == "--verbose-prompt") {
+            params.verbose_prompt = true;
+        } else if (arg == "-r" || arg == "--reverse-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.antiprompt.push_back(argv[i]);
+        } else if (arg == "--perplexity") {
+            params.perplexity = true;
+        } else if (arg == "--ignore-eos") {
+            params.logit_bias[llama_token_eos()] = -INFINITY;
+        } else if (arg == "--no-penalize-nl") {
+            params.penalize_nl = false;
+        } else if (arg == "-l" || arg == "--logit-bias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::stringstream ss(argv[i]);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                } else {
+                    throw std::exception();
+                }
+            } catch (const std::exception&) {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, default_params);
+            exit(0);
+        } else if (arg == "--random-prompt") {
+            params.random_prompt = true;
+        } else if (arg == "--in-prefix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_prefix = argv[i];
+        } else if (arg == "--in-suffix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_suffix = argv[i];
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
+    }
+    if (params.prompt_cache_all &&
+            (params.interactive || params.interactive_first ||
+             params.instruct)) {
+        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
+    }
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+    if (escape_prompt) {
+        process_escapes(params.prompt);
+    }
+    return true;
+}
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
+    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        prompt file to start generation.\n");
+    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    if (llama_mlock_supported()) {
+        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+    }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+#endif
+    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
+    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    const int r = rng() % 10;
+    switch (r) {
+        case 0: return "So";
+        case 1: return "Once upon a time";
+        case 2: return "When";
+        case 3: return "The";
+        case 4: return "After";
+        case 5: return "If";
+        case 6: return "import";
+        case 7: return "He";
+        case 8: return "She";
+        case 9: return "They";
+        default: return "To";
+    }
+    return "The";
+}
+// TODO: not great allocating this every time
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int) add_bos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
+    res.resize(n);
+    return res;
+}
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+    auto lparams = llama_context_default_params();
+    lparams.n_ctx        = params.n_ctx;
+    lparams.n_batch      = params.n_batch;
+    lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu     = params.main_gpu;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
+    lparams.seed         = params.seed;
+    lparams.f16_kv       = params.memory_f16;
+    lparams.use_mmap     = params.use_mmap;
+    lparams.use_mlock    = params.use_mlock;
+    lparams.logits_all   = params.perplexity;
+    lparams.embedding    = params.embedding;
+    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
+    }
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    if (lctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
+    }
+    if (!params.lora_adapter.empty()) {
+        int err = llama_model_apply_lora_from_file(model,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+    return std::make_tuple(model, lctx);
+}
+void console_init(console_state & con_st) {
+#if defined(_WIN32)
+    // Windows-specific console initialization
+    DWORD dwMode = 0;
+    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
+        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
+        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
+            con_st.hConsole = NULL;
+        }
+    }
+    if (con_st.hConsole) {
+        // Enable ANSI colors on Windows 10+
+        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
+        }
+        // Set console output codepage to UTF8
+        SetConsoleOutputCP(CP_UTF8);
+    }
+    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
+    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
+        // Set console input codepage to UTF16
+        _setmode(_fileno(stdin), _O_WTEXT);
+        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+        SetConsoleMode(hConIn, dwMode);
+    }
+#else
+    // POSIX-specific console initialization
+    struct termios new_termios;
+    tcgetattr(STDIN_FILENO, &con_st.prev_state);
+    new_termios = con_st.prev_state;
+    new_termios.c_lflag &= ~(ICANON | ECHO);
+    new_termios.c_cc[VMIN] = 1;
+    new_termios.c_cc[VTIME] = 0;
+    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
+    con_st.tty = fopen("/dev/tty", "w+");
+    if (con_st.tty != nullptr) {
+        con_st.out = con_st.tty;
+    }
+    setlocale(LC_ALL, "");
+#endif
+}
+void console_cleanup(console_state & con_st) {
+    // Reset console color
+    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+#if !defined(_WIN32)
+    if (con_st.tty != nullptr) {
+        con_st.out = stdout;
+        fclose(con_st.tty);
+        con_st.tty = nullptr;
+    }
+    // Restore the terminal settings on POSIX systems
+    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
+#endif
+}
+/* Keep track of current color of output, and emit ANSI code if it changes. */
+void console_set_color(console_state & con_st, console_color_t color) {
+    if (con_st.use_color && con_st.color != color) {
+        fflush(stdout);
+        switch(color) {
+            case CONSOLE_COLOR_DEFAULT:
+                fprintf(con_st.out, ANSI_COLOR_RESET);
+                break;
+            case CONSOLE_COLOR_PROMPT:
+                fprintf(con_st.out, ANSI_COLOR_YELLOW);
+                break;
+            case CONSOLE_COLOR_USER_INPUT:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
+                break;
+            case CONSOLE_COLOR_ERROR:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+                break;
+        }
+        con_st.color = color;
+        fflush(con_st.out);
+    }
+}
+char32_t getchar32() {
+#if defined(_WIN32)
+    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+    wchar_t high_surrogate = 0;
+    while (true) {
+        INPUT_RECORD record;
+        DWORD count;
+        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+            return WEOF;
+        }
+        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+            if (wc == 0) {
+                continue;
+            }
+            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                high_surrogate = wc;
+                continue;
+            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                if (high_surrogate != 0) { // Check if we have a high surrogate
+                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                }
+            }
+            high_surrogate = 0; // Reset the high surrogate
+            return static_cast<char32_t>(wc);
+        }
+    }
+#else
+    wchar_t wc = getwchar();
+    if (static_cast<wint_t>(wc) == WEOF) {
+        return WEOF;
+    }
+#if WCHAR_MAX == 0xFFFF
+    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+        wchar_t low_surrogate = getwchar();
+        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
+            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
+        }
+    }
+    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
+        return 0xFFFD; // Return the replacement character U+FFFD
+    }
+#endif
+    return static_cast<char32_t>(wc);
+#endif
+}
+void pop_cursor(console_state & con_st) {
+#if defined(_WIN32)
+    if (con_st.hConsole != NULL) {
+        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
+        COORD newCursorPosition = bufferInfo.dwCursorPosition;
+        if (newCursorPosition.X == 0) {
+            newCursorPosition.X = bufferInfo.dwSize.X - 1;
+            newCursorPosition.Y -= 1;
+        } else {
+            newCursorPosition.X -= 1;
+        }
+        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
+        return;
+    }
+#endif
+    putc('\b', con_st.out);
+}
+int estimateWidth(char32_t codepoint) {
+#if defined(_WIN32)
+    return 1;
+#else
+    return wcwidth(codepoint);
+#endif
+}
+int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
+        // go with the default
+        return expectedWidth;
+    }
+    COORD initialPosition = bufferInfo.dwCursorPosition;
+    DWORD nNumberOfChars = length;
+    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
+    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
+    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
+    // Figure out our real position if we're in the last column
+    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
+        DWORD nNumberOfChars;
+        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
+        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
+    }
+    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
+    if (width < 0) {
+        width += newBufferInfo.dwSize.X;
+    }
+    return width;
+#else
+    // we can trust expectedWidth if we've got one
+    if (expectedWidth >= 0 || con_st.tty == nullptr) {
+        fwrite(utf8_codepoint, length, 1, con_st.out);
+        return expectedWidth;
+    }
+    fputs("\033[6n", con_st.tty); // Query cursor position
+    int x1, x2, y1, y2;
+    int results = 0;
+    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
+    fwrite(utf8_codepoint, length, 1, con_st.tty);
+    fputs("\033[6n", con_st.tty); // Query cursor position
+    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
+    if (results != 4) {
+        return expectedWidth;
+    }
+    int width = x2 - x1;
+    if (width < 0) {
+        // Calculate the width considering text wrapping
+        struct winsize w;
+        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+        width += w.ws_col;
+    }
+    return width;
+#endif
+}
+void replace_last(console_state & con_st, char ch) {
+#if defined(_WIN32)
+    pop_cursor(con_st);
+    put_codepoint(con_st, &ch, 1, 1);
+#else
+    fprintf(con_st.out, "\b%c", ch);
+#endif
+}
+void append_utf8(char32_t ch, std::string & out) {
+    if (ch <= 0x7F) {
+        out.push_back(static_cast<unsigned char>(ch));
+    } else if (ch <= 0x7FF) {
+        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0xFFFF) {
+        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0x10FFFF) {
+        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else {
+        // Invalid Unicode code point
+    }
+}
+// Helper function to remove the last UTF-8 character from a string
+void pop_back_utf8_char(std::string & line) {
+    if (line.empty()) {
+        return;
+    }
+    size_t pos = line.length() - 1;
+    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
+    }
+    line.erase(pos);
+}
+bool console_readline(console_state & con_st, std::string & line) {
+    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+    if (con_st.out != stdout) {
+        fflush(stdout);
+    }
+    line.clear();
+    std::vector<int> widths;
+    bool is_special_char = false;
+    bool end_of_stream = false;
+    char32_t input_char;
+    while (true) {
+        fflush(con_st.out); // Ensure all output is displayed before waiting for input
+        input_char = getchar32();
+        if (input_char == '\r' || input_char == '\n') {
+            break;
+        }
+        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+            end_of_stream = true;
+            break;
+        }
+        if (is_special_char) {
+            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+            replace_last(con_st, line.back());
+            is_special_char = false;
+        }
+        if (input_char == '\033') { // Escape sequence
+            char32_t code = getchar32();
+            if (code == '[' || code == 0x1B) {
+                // Discard the rest of the escape sequence
+                while ((code = getchar32()) != (char32_t) WEOF) {
+                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
+                        break;
+                    }
+                }
+            }
+        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
+            if (!widths.empty()) {
+                int count;
+                do {
+                    count = widths.back();
+                    widths.pop_back();
+                    // Move cursor back, print space, and move cursor back again
+                    for (int i = 0; i < count; i++) {
+                        replace_last(con_st, ' ');
+                        pop_cursor(con_st);
+                    }
+                    pop_back_utf8_char(line);
+                } while (count == 0 && !widths.empty());
+            }
+        } else {
+            int offset = line.length();
+            append_utf8(input_char, line);
+            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+            if (width < 0) {
+                width = 0;
+            }
+            widths.push_back(width);
+        }
+        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+            replace_last(con_st, line.back());
+            is_special_char = true;
+        }
+    }
+    bool has_more = con_st.multiline_input;
+    if (is_special_char) {
+        replace_last(con_st, ' ');
+        pop_cursor(con_st);
+        char last = line.back();
+        line.pop_back();
+        if (last == '\\') {
+            line += '\n';
+            fputc('\n', con_st.out);
+            has_more = !has_more;
+        } else {
+            // llama will just eat the single space, it won't act as a space
+            if (line.length() == 1 && line.back() == ' ') {
+                line.clear();
+                pop_cursor(con_st);
+            }
+            has_more = false;
+        }
+    } else {
+        if (end_of_stream) {
+            has_more = false;
+        } else {
+            line += '\n';
+            fputc('\n', con_st.out);
+        }
+    }
+    fflush(con_st.out);
+    return has_more;
+}

examples/common.h ADDED Viewed

	@@ -0,0 +1,138 @@

+// Various helper functions and utilities
+#pragma once
+#include "llama.h"
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>
+#if !defined (_WIN32)
+#include <stdio.h>
+#include <termios.h>
+#endif
+//
+// CLI argument parsing
+//
+int32_t get_num_physical_cores();
+struct gpt_params {
+    int32_t seed                            = -1;  // RNG seed
+    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_predict                       = -1;  // new tokens to predict
+    int32_t n_ctx                           = 512; // context size
+    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool    low_vram                        = 0;   // if true, reduce VRAM usage at the cost of performance
+    // sampling parameters
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typical_p         = 1.00f; // 1.0 = disabled
+    float   temp              = 0.80f; // 1.0 = disabled
+    float   repeat_penalty    = 1.10f; // 1.0 = disabled
+    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.00f; // 0.0 = disabled
+    float   presence_penalty  = 0.00f; // 0.0 = disabled
+    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model_alias       = "unknown"; // model alias
+    std::string prompt            = "";
+    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
+    std::string input_prefix      = "";  // string to prefix user inputs with
+    std::string input_suffix      = "";  // string to suffix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::string lora_adapter = "";  // lora adapter path
+    std::string lora_base    = "";  // base model path for the lora adapter
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
+    bool random_prompt     = false; // do not randomize prompt if none provided
+    bool use_color         = false; // use color to distinguish generations and inputs
+    bool interactive       = false; // interactive mode
+    bool prompt_cache_all  = false; // save user input and generations to prompt cache
+    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
+    bool embedding         = false; // get only sentence embedding
+    bool interactive_first = false; // wait for user input immediately
+    bool multiline_input   = false; // reverse the usage of `\`
+    bool instruct          = false; // instruction mode (used for Alpaca models)
+    bool penalize_nl       = true;  // consider newlines as a repeatable token
+    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_mlock         = false; // use mlock to keep model in memory
+    bool mem_test          = false; // compute maximum memory usage
+    bool export_cgraph     = false; // export the computation graph
+    bool verbose_prompt    = false; // print prompt tokens before generation
+};
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+std::string gpt_random_prompt(std::mt19937 & rng);
+//
+// Vocab utils
+//
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+//
+// Model utils
+//
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+//
+// Console utils
+//
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+#define ANSI_BOLD          "\x1b[1m"
+enum console_color_t {
+    CONSOLE_COLOR_DEFAULT=0,
+    CONSOLE_COLOR_PROMPT,
+    CONSOLE_COLOR_USER_INPUT,
+    CONSOLE_COLOR_ERROR
+};
+struct console_state {
+    bool multiline_input = false;
+    bool use_color = false;
+    console_color_t color = CONSOLE_COLOR_DEFAULT;
+    FILE* out = stdout;
+#if defined (_WIN32)
+    void* hConsole;
+#else
+    FILE* tty = nullptr;
+    termios prev_state;
+#endif
+};
+void console_init(console_state & con_st);
+void console_cleanup(console_state & con_st);
+void console_set_color(console_state & con_st, console_color_t color);
+bool console_readline(console_state & con_st, std::string & line);

examples/embedding/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET embedding)
+add_executable(${TARGET} embedding.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/embedding/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # embedding
2	+
3	+ TODO

examples/embedding/embedding.cpp ADDED Viewed

	@@ -0,0 +1,97 @@

+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+#include <ctime>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    params.embedding = true;
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    llama_init_backend();
+    llama_model * model;
+    llama_context * ctx;
+    // load the model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    int n_past = 0;
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+    if (params.embedding){
+        if (embd_inp.size() > 0) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+        const int n_embd = llama_n_embd(ctx);
+        const auto embeddings = llama_get_embeddings(ctx);
+        for (int i = 0; i < n_embd; i++) {
+            printf("%f ", embeddings[i]);
+        }
+        printf("\n");
+    }
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    llama_free_model(model);
+    return 0;
+}

examples/gpt4all.sh ADDED Viewed

	@@ -0,0 +1,15 @@

+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+cd `dirname $0`
+cd ..
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 -n -1 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

examples/jeopardy/README.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# llama.cpp/example/jeopardy
+This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+Step 1: Open jeopardy.sh and modify the following:
+```
+MODEL=(path to your model)
+MODEL_NAME=(name of your model)
+prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
+opts=(add -instruct here if needed for your model, or anything else you want to test out)
+```
+Step 2: Run `jeopardy.sh` from the llama.cpp folder
+Step 3: Repeat steps 1 and 2 until you have all the results you need.
+Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
+Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

examples/jeopardy/graph.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import matplotlib.pyplot as plt
+import os
+import csv
+labels = []
+numbers = []
+numEntries = 1
+rows = []
+def bar_chart(numbers, labels, pos):
+    plt.bar(pos, numbers, color='blue')
+    plt.xticks(ticks=pos, labels=labels)
+    plt.title("Jeopardy Results by Model")
+    plt.xlabel("Model")
+    plt.ylabel("Questions Correct")
+    plt.show()
+def calculatecorrect():
+    directory = os.fsencode("./examples/jeopardy/results/")
+    csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
+    for row in csv_reader:
+        global rows
+        rows.append(row)
+    for listing in os.listdir(directory):
+        filename = os.fsdecode(listing)
+        if filename.endswith(".txt"):
+            file = open("./examples/jeopardy/results/" + filename, "rt")
+            global labels
+            global numEntries
+            global numbers
+            labels.append(filename[:-4])
+            numEntries += 1
+            i = 1
+            totalcorrect = 0
+            for line in file.readlines():
+                if line.strip() != "------":
+                    print(line)
+                else:
+                    print("Correct answer: " + rows[i][2] + "\n")
+                    i += 1
+                    print("Did the AI get the question right? (y/n)")
+                    if input() == "y":
+                        totalcorrect += 1
+            numbers.append(totalcorrect)
+if __name__ == '__main__':
+    calculatecorrect()
+    pos = list(range(numEntries))
+    labels.append("Human")
+    numbers.append(48.11)
+    bar_chart(numbers, labels, pos)
+    print(labels)
+    print(numbers)

examples/jeopardy/jeopardy.sh ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/bin/bash
+set -e
+MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
+MODEL_NAME=Vicuna
+# exec options
+prefix="Human: " # Ex. Vicuna uses "Human: "
+opts="--temp 0 -n 80" # additional flags
+nl='
+'
+introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
+# file options
+question_file=./examples/jeopardy/questions.txt
+touch ./examples/jeopardy/results/$MODEL_NAME.txt
+output_file=./examples/jeopardy/results/$MODEL_NAME.txt
+counter=1
+echo 'Running'
+while IFS= read -r question
+do
+  exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+  echo $counter
+  echo "Current Question: $question"
+  eval "$exe_cmd"
+  echo -e "\n------" >> $output_file
+  counter=$((counter+1))
+done < "$question_file"

examples/jeopardy/qasheet.csv ADDED Viewed

	@@ -0,0 +1,103 @@

+Index,Original Category,Original Correct Question,Model Prompt
+1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
+3,Writers’ Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
+4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
+5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
+7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
+8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
+9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
+10,Movies of the ’80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
+12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
+13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
+14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
+15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
+16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
+18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
+19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
+20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
+21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
+23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
+25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
+27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
+28,Geographic Name’s the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
+29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
+30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
+31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
+32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
+33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
+34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
+35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
+36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
+37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
+38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?"
+39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
+40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
+42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
+43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
+45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
+46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
+47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
+48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
+49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
+52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
+53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
+54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
+55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
+56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
+57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
+58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
+61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
+62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
+63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
+64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
+65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
+66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
+70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
+71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
+72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
+74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
+75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
+76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
+77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
+78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
+79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
+80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
+81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
+82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
+83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
+86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
+87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
+88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
+90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
+91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+92,Name’s The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
+93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
+94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
+96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
+97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
+98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
+99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
+,,,
+TOTALS,,,

examples/jeopardy/questions.txt ADDED Viewed

	@@ -0,0 +1,100 @@

+Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
+Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
+James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
+England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
+In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
+Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
+Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
+What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
+A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
+Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
+The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
+In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
+What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
+What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
+In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
+At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
+Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
+In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
+A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
+After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
+The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
+This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
+An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
+Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
+What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
+A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
+Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
+Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
+The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
+For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
+Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
+In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
+In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
+In 2010 who introduced the 4-point shot, 35 feet from the basket?
+Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
+In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
+Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
+In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
+Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
+This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
+Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
+Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
+5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
+Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
+The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
+Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
+Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
+Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
+Like Sir Thomas More, 3 16th century English queens are buried at what British location?
+In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
+The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
+What was first sold in 1908, at a price equivalent to about $27,000 today?
+The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
+Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
+After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
+Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+Until a 1903 secession, what country's contiguous territory spanned 2 continents?
+Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
+Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
+Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
+Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
+Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
+Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
+In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
+In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
+In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
+The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
+What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
+What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
+Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
+Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
+The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
+Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
+A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
+Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
+Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
+The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
+What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

examples/main/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/main/README.md ADDED Viewed

	@@ -0,0 +1,293 @@

+# llama.cpp/example/main
+This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+## Table of Contents
+1. [Quick Start](#quick-start)
+2. [Common Options](#common-options)
+3. [Input Prompts](#input-prompts)
+4. [Interaction](#interaction)
+5. [Context Management](#context-management)
+6. [Generation Flags](#generation-flags)
+7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+8. [Additional Options](#additional-options)
+## Quick Start
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+#### Unix-based systems (Linux, macOS, etc.):
+```bash
+./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+```
+#### Windows:
+```powershell
+main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
+```
+For an interactive experience, try this command:
+#### Unix-based systems (Linux, macOS, etc.):
+```bash
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+'User: Hi
+AI: Hello. I am an AI chatbot. Would you like to talk?
+User: Sure!
+AI: What would you like to talk about?
+User:'
+```
+#### Windows:
+```powershell
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+```
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+#### Unix-based systems (Linux, macOS, etc.):
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+```
+#### Windows:
+```powershell
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+```
+## Common Options
+In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+## Input Prompts
+The `main` program provides several ways to interact with the LLaMA models using input prompts:
+-   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
+-   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+-   `--random-prompt`: Start with a randomized prompt.
+## Interaction
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
+### Interaction Options
+-   `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
+-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+-   `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
+-   `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+### Reverse Prompts
+Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
+-   `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
+To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
+### In-Prefix
+The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
+```sh
+./main -r "User:" --in-prefix " "
+```
+### In-Suffix
+The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
+```sh
+./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+```
+### Instruction Mode
+Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
+-   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+## Context Management
+During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
+### Context Size
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+-   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+### Keep Prompt
+The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
+-   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+## Generation Flags
+The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.
+### Number of Tokens to Predict
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+### Temperature
+-   `--temp N`: Adjust the randomness of the generated text (default: 0.8).
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
+Example usage: `--temp 0.5`
+### Repeat Penalty
+-   `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+-   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+-   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
+Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
+### Top-K Sampling
+-   `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Example usage: `--top-k 30`
+### Top-P Sampling
+-   `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+Example usage: `--top-p 0.95`
+### Tail Free Sampling (TFS)
+-   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+Example usage: `--tfs 2.0`
+### Locally Typical Sampling
+-   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.
+Example usage: `--typical 0.9`
+### Mirostat Sampling
+-   `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+-   `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+-   `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
+The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
+### Logit Bias
+-   `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
+The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.
+For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced.
+A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)
+Example usage: `--logit-bias 29905-inf`
+### RNG Seed
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
+## Performance Tuning and Memory Options
+These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.
+### Number of Threads
+-   `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
+### Mlock
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
+### No Memory Mapping
+-   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
+### Memory Float 32
+-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+### Batch Size
+-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+### Prompt Caching
+-   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
+### Quantization
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+## Additional Options
+These options provide extra functionality and customization when running the LLaMA models:
+-   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
+-   `--verbose-prompt`: Print the prompt before generating text.
+-   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
+-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

examples/main/main.cpp ADDED Viewed

	@@ -0,0 +1,675 @@

+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+static console_state con_st;
+static llama_context ** g_ctx;
+static bool is_interacting = false;
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        if (!is_interacting) {
+            is_interacting=true;
+        } else {
+            console_cleanup(con_st);
+            printf("\n");
+            llama_print_timings(*g_ctx);
+            _exit(130);
+        }
+    }
+}
+#endif
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    con_st.use_color = params.use_color;
+    con_st.multiline_input = params.multiline_input;
+    console_init(con_st);
+    atexit([]() { console_cleanup(con_st); });
+    if (params.perplexity) {
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");
+        return 0;
+    }
+    if (params.embedding) {
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");
+        return 0;
+    }
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    llama_init_backend();
+    llama_model * model;
+    llama_context * ctx;
+    g_ctx = &ctx;
+    // load the model and apply lora adapter, if any
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // uncomment the "used_mem" line in llama.cpp to see the results
+    if (params.mem_test) {
+        {
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        }
+        {
+            const std::vector<llama_token> tmp = { 0, };
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+        }
+        llama_print_timings(ctx);
+        llama_free(ctx);
+        llama_free_model(model);
+        return 0;
+    }
+    // export the cgraph and exit
+    if (params.export_cgraph) {
+        llama_eval_export(ctx, "llama.ggml");
+        llama_free(ctx);
+        llama_free_model(model);
+        return 0;
+    }
+    std::string path_session = params.path_prompt_cache;
+    std::vector<llama_token> session_tokens;
+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        // fopen to check for existing session
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+            session_tokens.resize(params.n_ctx);
+            size_t n_token_count_out = 0;
+            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
+            session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+    // tokenize the prompt
+    std::vector<llama_token> embd_inp;
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }
+    const int n_ctx = llama_n_ctx(ctx);
+    if ((int) embd_inp.size() > n_ctx - 4) {
+        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        return 1;
+    }
+    // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+    // if we will use the cache for the full prompt without reaching the end of the cache, force
+    // reevaluation of the last token token to recalculate the cached logits
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
+            session_tokens.size() > embd_inp.size()) {
+        session_tokens.resize(embd_inp.size() - 1);
+    }
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
+        params.n_keep = (int)embd_inp.size();
+    }
+    // prefix & suffix for instruct mode
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive_first = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
+    }
+    // enable interactive mode if interactive start is specified
+    if (params.interactive_first) {
+        params.interactive = true;
+    }
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        if (params.n_keep > 0) {
+        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
+            for (int i = 0; i < params.n_keep; i++) {
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+            }
+            fprintf(stderr, "'\n");
+        }
+        fprintf(stderr, "\n");
+    }
+    if (params.interactive) {
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+        struct sigaction sigint_action;
+        sigint_action.sa_handler = sigint_handler;
+        sigemptyset (&sigint_action.sa_mask);
+        sigint_action.sa_flags = 0;
+        sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+        };
+        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        if (params.antiprompt.size()) {
+            for (auto antiprompt : params.antiprompt) {
+                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
+            }
+        }
+        if (!params.input_prefix.empty()) {
+            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
+        }
+        if (!params.input_suffix.empty()) {
+            fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
+        }
+    }
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    fprintf(stderr, "\n\n");
+    // TODO: replace with ring-buffer
+    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+    if (params.interactive) {
+        const char *control_message;
+        if (con_st.multiline_input) {
+            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n";
+        } else {
+            control_message = " - Press Return to return control to LLaMa.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n"
+                              " - If you want to submit another line, end your input with '\\'.\n";
+        }
+        fprintf(stderr, "== Running in interactive mode. ==\n"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+               " - Press Ctrl+C to interject at any time.\n"
+#endif
+               "%s\n", control_message);
+        is_interacting = params.interactive_first;
+    }
+    bool is_antiprompt        = false;
+    bool input_echo           = true;
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();
+    int n_past             = 0;
+    int n_remain           = params.n_predict;
+    int n_consumed         = 0;
+    int n_session_consumed = 0;
+    // the first thing we will do is to output the prompt, so set color accordingly
+    console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+    std::vector<llama_token> embd;
+    // do one empty run to warm up the model
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
+    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
+        // predict
+        if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+            if (n_past + (int) embd.size() > n_ctx) {
+                const int n_left = n_past - params.n_keep;
+                // always keep the first token - BOS
+                n_past = std::max(1, params.n_keep);
+                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+                // stop saving session if we run out of context
+                path_session.clear();
+                //printf("\n---\n");
+                //printf("resetting: '");
+                //for (int i = 0; i < (int) embd.size(); i++) {
+                //    printf("%s", llama_token_to_str(ctx, embd[i]));
+                //}
+                //printf("'\n");
+                //printf("\n---\n");
+            }
+            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+            if (n_session_consumed < (int) session_tokens.size()) {
+                size_t i = 0;
+                for ( ; i < embd.size(); i++) {
+                    if (embd[i] != session_tokens[n_session_consumed]) {
+                        session_tokens.resize(n_session_consumed);
+                        break;
+                    }
+                    n_past++;
+                    n_session_consumed++;
+                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        ++i;
+                        break;
+                    }
+                }
+                if (i > 0) {
+                    embd.erase(embd.begin(), embd.begin() + i);
+                }
+            }
+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > params.n_batch) {
+                    n_eval = params.n_batch;
+                }
+                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                    fprintf(stderr, "%s : failed to eval\n", __func__);
+                    return 1;
+                }
+                n_past += n_eval;
+            }
+            if (embd.size() > 0 && !path_session.empty()) {
+                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                n_session_consumed = session_tokens.size();
+            }
+        }
+        embd.clear();
+        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+            // out of user input, sample next token
+            const float   temp            = params.temp;
+            const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const float   top_p           = params.top_p;
+            const float   tfs_z           = params.tfs_z;
+            const float   typical_p       = params.typical_p;
+            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+            const float   repeat_penalty  = params.repeat_penalty;
+            const float   alpha_presence  = params.presence_penalty;
+            const float   alpha_frequency = params.frequency_penalty;
+            const int     mirostat        = params.mirostat;
+            const float   mirostat_tau    = params.mirostat_tau;
+            const float   mirostat_eta    = params.mirostat_eta;
+            const bool    penalize_nl     = params.penalize_nl;
+            // optionally save the session on first sample (for faster prompt loading next time)
+            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
+                need_to_save_session = false;
+                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+            }
+            llama_token id = 0;
+            {
+                auto logits  = llama_get_logits(ctx);
+                auto n_vocab = llama_n_vocab(ctx);
+                // Apply params.logit_bias map
+                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+                    logits[it->first] += it->second;
+                }
+                std::vector<llama_token_data> candidates;
+                candidates.reserve(n_vocab);
+                for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                }
+                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+                // Apply penalties
+                float nl_logit = logits[llama_token_nl()];
+                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+                llama_sample_repetition_penalty(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, repeat_penalty);
+                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, alpha_frequency, alpha_presence);
+                if (!penalize_nl) {
+                    logits[llama_token_nl()] = nl_logit;
+                }
+                if (temp <= 0) {
+                    // Greedy sampling
+                    id = llama_sample_token_greedy(ctx, &candidates_p);
+                } else {
+                    if (mirostat == 1) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        const int mirostat_m = 100;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+                    } else if (mirostat == 2) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+                    } else {
+                        // Temperature sampling
+                        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                        llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token(ctx, &candidates_p);
+                    }
+                }
+                // printf("`%d`", candidates_p.size);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+            // replace end of text token with newline token when in interactive mode
+            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+                id = llama_token_newline.front();
+                if (params.antiprompt.size() != 0) {
+                    // tokenize and inject first reverse prompt
+                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                }
+            }
+            // add it to the context
+            embd.push_back(id);
+            // echo this to console
+            input_echo = true;
+            // decrement remaining sampling budget
+            --n_remain;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+        // display text
+        if (input_echo) {
+            for (auto id : embd) {
+                printf("%s", llama_token_to_str(ctx, id));
+            }
+            fflush(stdout);
+        }
+        // reset color to default if we there is no pending user input
+        if (input_echo && (int)embd_inp.size() == n_consumed) {
+            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+        }
+        // if not currently processing queued inputs;
+        if ((int) embd_inp.size() <= n_consumed) {
+            // check for reverse prompt
+            if (params.antiprompt.size()) {
+                std::string last_output;
+                for (auto id : last_n_tokens) {
+                    last_output += llama_token_to_str(ctx, id);
+                }
+                is_antiprompt = false;
+                // Check if each of the reverse prompts appears at the end of the output.
+                // If we're not running interactively, the reverse prompt might be tokenized with some following characters
+                // so we'll compensate for that by widening the search window a bit.
+                for (std::string & antiprompt : params.antiprompt) {
+                    size_t extra_padding = params.interactive ? 0 : 2;
+                    size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
+                        ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
+                        : 0;
+                    if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
+                        if (params.interactive) {
+                            is_interacting = true;
+                            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                        }
+                        is_antiprompt = true;
+                        fflush(stdout);
+                        break;
+                    }
+                }
+            }
+            if (n_past > 0 && is_interacting) {
+                if (params.instruct) {
+                    printf("\n> ");
+                }
+                std::string buffer;
+                if (!params.input_prefix.empty()) {
+                    buffer += params.input_prefix;
+                    printf("%s", buffer.c_str());
+                }
+                std::string line;
+                bool another_line = true;
+                do {
+                    another_line = console_readline(con_st, line);
+                    buffer += line;
+                } while (another_line);
+                // done taking input, reset color
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                // Add tokens to embd only if the input buffer is non-empty
+                // Entering a empty line lets the user pass control back
+                if (buffer.length() > 1) {
+                    // append input suffix if any
+                    if (!params.input_suffix.empty()) {
+                        buffer += params.input_suffix;
+                        printf("%s", params.input_suffix.c_str());
+                    }
+                    // instruct mode: insert instruction prefix
+                    if (params.instruct && !is_antiprompt) {
+                        n_consumed = embd_inp.size();
+                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+                    }
+                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                    // instruct mode: insert response suffix
+                    if (params.instruct) {
+                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                    }
+                    n_remain -= line_inp.size();
+                }
+                input_echo = false; // do not echo this again
+            }
+            if (n_past > 0) {
+                is_interacting = false;
+            }
+        }
+        // end of text token
+        if (!embd.empty() && embd.back() == llama_token_eos()) {
+            if (params.instruct) {
+                is_interacting = true;
+            } else {
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
+        }
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
+            n_remain = params.n_predict;
+            is_interacting = true;
+        }
+    }
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
+        fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+    }
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    llama_free_model(model);
+    return 0;
+}

examples/metal/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+set(TEST_TARGET metal)
+add_executable(${TEST_TARGET} metal.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)

examples/metal/metal.cpp ADDED Viewed

	@@ -0,0 +1,104 @@

+// Evaluate a statically exported ggml computation graph with Metal
+//
+// - First, export a LLaMA graph:
+//
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//
+// - Run this tool to evaluate the exported graph:
+//
+//  $ ./bin/metal llama.ggml
+//
+// The purpose of this tool is mostly for debugging and demonstration purposes.
+// The main limitation of exporting computation graphs is that their sizes are static which often
+// can be a problem for real-world applications.
+//
+#include "ggml.h"
+#include "ggml-metal.h"
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+int main(int argc, char ** argv) {
+    ggml_time_init();
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
+        return -1;
+    }
+    const char * fname_cgraph = argv[1];
+    // load the compute graph
+    struct ggml_context * ctx_data = NULL;
+    struct ggml_context * ctx_eval = NULL;
+    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+    gf.n_threads = 1;
+    // this allocates all Metal resources and memory buffers
+    auto * ctx_metal = ggml_metal_init();
+    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
+    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
+    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
+    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
+    // main
+    {
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+        *(int32_t *) input->data = 1; // BOS
+        ggml_metal_set_tensor(ctx_metal, input);
+        // warmup
+        ggml_metal_graph_compute(ctx_metal, &gf);
+        const int n_iter = 16;
+        const int64_t t0 = ggml_time_us();
+        // the actual inference happens here
+        for (int i = 0; i < n_iter; ++i) {
+            ggml_metal_graph_compute(ctx_metal, &gf);
+        }
+        const int64_t t1 = ggml_time_us();
+        printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
+    }
+    // debug output
+    {
+        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+        ggml_metal_get_tensor(ctx_metal, logits);
+        float * ptr = (float *) ggml_get_data(logits);
+        printf("logits: ");
+        for (int i = 0; i < 10; i++) {
+            printf("%8.4f ", ptr[i]);
+        }
+        printf("\n");
+        int imax = 0;
+        double sum = 0.0;
+        double vmax = -1e9;
+        for (int i = 0; i < 32000; i++) {
+            sum += (double) ptr[i];
+            if (ptr[i] > vmax) {
+                vmax = ptr[i];
+                imax = i;
+            }
+        }
+        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+    }
+    ggml_metal_free(ctx_metal);
+    ggml_free(ctx_data);
+    ggml_free(ctx_eval);
+    return 0;
+}

examples/perplexity/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET perplexity)
+add_executable(${TARGET} perplexity.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/perplexity/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # perplexity
2	+
3	+ TODO

examples/perplexity/perplexity.cpp ADDED Viewed

	@@ -0,0 +1,176 @@

+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+#include <cmath>
+#include <ctime>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+std::vector<float> softmax(const std::vector<float>& logits) {
+    std::vector<float> probs(logits.size());
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        const float logit = logits[i] - max_logit;
+        const float exp_logit = expf(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    // BOS tokens will be added for each chunk before eval
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+    int count   = 0;
+    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_vocab = llama_n_vocab(ctx);
+    const int n_batch = params.n_batch;
+    double nll = 0.0;
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * params.n_ctx;
+        const int end   = start + params.n_ctx;
+        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
+        std::vector<float> logits;
+        const auto t_start = std::chrono::high_resolution_clock::now();
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+            // add BOS token for the first batch of each chunk
+            if (j == 0) {
+                tokens[batch_start] = llama_token_bos();
+            }
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+            // restore the original token in case it was set to BOS
+            tokens[batch_start] = token_org;
+            const auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+        }
+        const auto t_end = std::chrono::high_resolution_clock::now();
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%d minutes\n", total_seconds / 60);
+        }
+        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
+        // calculate the perplexity over the last half of the window (so the model always has
+        // some context to predict the token).
+        //
+        // We rely on the fact that attention in the forward pass only looks at previous
+        // tokens here, so the logits returned for each token are an accurate representation
+        // of what the model would have predicted at that point.
+        //
+        // Example, we have a context window of 512, we will compute perplexity for each of the
+        // last 256 tokens.  Then, we split the input up into context window size chunks to
+        // process the entire prompt.
+        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            const std::vector<float> tok_logits(
+                logits.begin() + (j + 0) * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.n_batch = 512;
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    params.perplexity = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+    llama_init_backend();
+    llama_model * model;
+    llama_context * ctx;
+    // load the model and apply lora adapter, if any
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+    perplexity(ctx, params);
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    llama_free_model(model);
+    return 0;
+}

examples/quantize-stats/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/quantize-stats/quantize-stats.cpp ADDED Viewed

	@@ -0,0 +1,438 @@

+#include "ggml.h"
+#include "build-info.h"
+#define LLAMA_API_INTERNAL
+#include "llama.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <thread>
+#include <mutex>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.bin";
+    bool verbose = false;
+    bool per_layer_stats = false;
+    bool print_histogram = false;
+    bool reference = false;
+    std::vector<std::string> include_layers;
+    std::vector<std::string> exclude_layers;
+    std::vector<enum ggml_type> include_types;
+};
+const size_t HISTOGRAM_BUCKETS = 150;
+const double HISTOGRAM_RANGE = 0.03;
+struct error_stats {
+    size_t num_samples;
+    double total_error;
+    double max_error;
+    uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    quantize_stats_params params;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+// Check if a layer is included/excluded by command line
+bool layer_included(const quantize_stats_params params, const std::string & layer) {
+    for (const auto& excluded : params.exclude_layers) {
+        if (std::regex_search(layer, std::regex(excluded))) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (std::regex_search(layer, std::regex(included))) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+// Update error statistics given vectors with the before/after result of quantization
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+    }
+    stats.num_samples += nelements;
+}
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+double find_quantile(const error_stats & stats, double quantile) {
+    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+    double accum = 0;
+    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+        accum += stats.error_histogram[i];
+        if (accum >= sum*quantile) {
+            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+// Run quantization function for a single layer and update error stats
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        const ggml_tensor * layer,
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {
+    assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
+    uint64_t nelements = ggml_nelements(layer);
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }
+    if (print_layer_stats) {
+        print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
+    }
+}
+int main(int argc, char ** argv) {
+    ggml_time_init();
+    quantize_stats_params params;
+    // read command line
+    int max_thread = 0;
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
+        } else if (arg == "-v") {
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT; ++j) {
+               const auto * name = ggml_type_name((ggml_type) j);
+               if (name && strcmp(argv[i], name) == 0) break;
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+            }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    // load the model
+    fprintf(stderr, "Loading model\n");
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
+    llama_context * ctx;
+    {
+        auto lparams = llama_context_default_params();
+        lparams.n_ctx      = 256;
+        lparams.seed       = 1;
+        lparams.f16_kv     = false;
+        lparams.use_mlock  = false;
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+        ctx = llama_new_context_with_model(model, lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            llama_free_model(model);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
+    // loop throught quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n",  ggml_type_name(type));
+            }
+            error_stats global_stats {};
+            for (const auto& kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { ggml_type_name(type) };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
+                );
+            }
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
+        }
+    }
+    llama_free(ctx);
+    llama_free_model(model);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+    return 0;
+}

examples/quantize/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/quantize/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # quantize
2	+
3	+ TODO

examples/quantize/quantize.cpp ADDED Viewed

	@@ -0,0 +1,257 @@

+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include <string>
+struct quant_option {
+    std::string name;
+    llama_ftype ftype;
+    std::string desc;
+};
+static const std::vector<struct quant_option> QUANT_OPTIONS = {
+    {
+        "Q4_0",
+        LLAMA_FTYPE_MOSTLY_Q4_0,
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
+    },
+    {
+        "Q4_1",
+        LLAMA_FTYPE_MOSTLY_Q4_1,
+        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
+    },
+    {
+        "Q5_0",
+        LLAMA_FTYPE_MOSTLY_Q5_0,
+        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
+    },
+    {
+        "Q5_1",
+        LLAMA_FTYPE_MOSTLY_Q5_1,
+        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
+    },
+#ifdef GGML_USE_K_QUANTS
+    {
+        "Q2_K",
+        LLAMA_FTYPE_MOSTLY_Q2_K,
+        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
+    },
+    {
+        "Q3_K",
+        LLAMA_FTYPE_MOSTLY_Q3_K_M,
+        "alias for Q3_K_M"
+    },
+    {
+        "Q3_K_S",
+        LLAMA_FTYPE_MOSTLY_Q3_K_S,
+        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
+    },
+    {
+        "Q3_K_M",
+        LLAMA_FTYPE_MOSTLY_Q3_K_M,
+        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
+    },
+    {
+        "Q3_K_L",
+        LLAMA_FTYPE_MOSTLY_Q3_K_L,
+        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
+    },
+    {
+        "Q4_K",
+        LLAMA_FTYPE_MOSTLY_Q4_K_M,
+        "alias for Q4_K_M",
+    },
+    {
+        "Q4_K_S",
+        LLAMA_FTYPE_MOSTLY_Q4_K_S,
+        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
+    },
+    {
+        "Q4_K_M",
+        LLAMA_FTYPE_MOSTLY_Q4_K_M,
+        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
+    },
+    {
+        "Q5_K",
+        LLAMA_FTYPE_MOSTLY_Q5_K_M,
+        "alias for Q5_K_M",
+    },
+    {
+        "Q5_K_S",
+        LLAMA_FTYPE_MOSTLY_Q5_K_S,
+        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
+    },
+    {
+        "Q5_K_M",
+        LLAMA_FTYPE_MOSTLY_Q5_K_M,
+        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
+    },
+    {
+        "Q6_K",
+        LLAMA_FTYPE_MOSTLY_Q6_K,
+        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
+    },
+#endif
+    {
+        "Q8_0",
+        LLAMA_FTYPE_MOSTLY_Q8_0,
+        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
+    },
+    {
+        "F16",
+        LLAMA_FTYPE_MOSTLY_F16,
+        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
+    },
+    {
+        "F32",
+        LLAMA_FTYPE_ALL_F32,
+        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
+    },
+};
+bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+    std::string ftype_str;
+    for (auto ch : ftype_str_in) {
+        ftype_str.push_back(std::toupper(ch));
+    }
+    for (auto & it : QUANT_OPTIONS) {
+        if (it.name == ftype_str) {
+            ftype = it.ftype;
+            ftype_str_out = it.name;
+            return true;
+        }
+    }
+    try {
+        int ftype_int = std::stoi(ftype_str);
+        for (auto & it : QUANT_OPTIONS) {
+            if (it.ftype == ftype_int) {
+                ftype = it.ftype;
+                ftype_str_out = it.name;
+                return true;
+            }
+        }
+    }
+    catch (...) {
+        // stoi failed
+    }
+    return false;
+}
+// usage:
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//
+void usage(const char * executable) {
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    fprintf(stderr, "\nAllowed quantization types:\n");
+    for (auto & it : QUANT_OPTIONS) {
+        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+    }
+    exit(1);
+}
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        usage(argv[0]);
+    }
+    llama_model_quantize_params params = llama_model_quantize_default_params();
+    int arg_idx = 1;
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
+            params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
+            params.allow_requantize = true;
+        } else {
+            usage(argv[0]);
+        }
+    }
+    if (argc - arg_idx < 3) {
+        usage(argv[0]);
+    }
+    llama_init_backend();
+    // parse command line arguments
+    const std::string fname_inp = argv[arg_idx];
+    arg_idx++;
+    std::string fname_out;
+    std::string ftype_str;
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+        std::string fpath;
+        const size_t pos = fname_inp.find_last_of('/');
+        if (pos != std::string::npos) {
+            fpath = fname_inp.substr(0, pos + 1);
+        }
+        // export as [inp path]/ggml-model-[ftype].bin
+        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        arg_idx++;
+    }
+    else {
+        fname_out = argv[arg_idx];
+        arg_idx++;
+        if (argc <= arg_idx) {
+            fprintf(stderr, "%s: missing ftype\n", __func__);
+            return 1;
+        }
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        arg_idx++;
+    }
+    // parse nthreads
+    if (argc > arg_idx) {
+        try {
+            params.nthread = std::stoi(argv[arg_idx]);
+        }
+        catch (const std::exception & e) {
+            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
+            return 1;
+        }
+    }
+    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (params.nthread > 0) {
+        fprintf(stderr, " using %d threads", params.nthread);
+    }
+    fprintf(stderr, "\n");
+    const int64_t t_main_start_us = llama_time_us();
+    int64_t t_quantize_us = 0;
+    // load the model
+    {
+        const int64_t t_start_us = llama_time_us();
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+        t_quantize_us = llama_time_us() - t_start_us;
+    }
+    // report timing
+    {
+        const int64_t t_main_end_us = llama_time_us();
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+    return 0;
+}

examples/reason-act.sh ADDED Viewed

	@@ -0,0 +1,17 @@

+#!/bin/bash
+cd `dirname $0`
+cd ..
+# get -m model parameter otherwise defer to default
+if [ "$1" == "-m" ]; then
+  MODEL="-m $2 "
+fi
+./main $MODEL --color \
+    -f ./prompts/reason-act.txt \
+    -i --interactive-first \
+    --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
+    -r "Question:" -r "Observation:" --in-prefix " " \
+    -n -1

examples/save-load-state/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/save-load-state/save-load-state.cpp ADDED Viewed

	@@ -0,0 +1,170 @@

+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+#include <vector>
+#include <cstdio>
+#include <chrono>
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    if (params.n_predict < 0) {
+        params.n_predict = 16;
+    }
+    auto lparams = llama_context_default_params();
+    lparams.n_ctx     = params.n_ctx;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
+    auto n_past = 0;
+    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
+    // init
+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == nullptr) {
+        return 1;
+    }
+    auto ctx = llama_new_context_with_model(model, lparams);
+    if (ctx == nullptr) {
+        llama_free_model(model);
+        return 1;
+    }
+    auto tokens = std::vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        llama_free_model(model);
+        return 1;
+    }
+    // evaluate prompt
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+    const size_t state_size = llama_get_state_size(ctx);
+    uint8_t * state_mem = new uint8_t[state_size];
+    // Save state (rng, logits, embedding and kv_cache) to file
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        fwrite(state_mem, 1, state_size, fp_write);
+        fclose(fp_write);
+    }
+    // save state (last tokens)
+    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
+    const auto n_past_saved = n_past;
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx, &candidates_p);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    // free old context
+    llama_free(ctx);
+    // make new context
+    auto ctx2 = llama_new_context_with_model(model, lparams);
+    // Load state (rng, logits, embedding and kv_cache) from file
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (state_size != llama_get_state_size(ctx2)) {
+            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
+            return 1;
+        }
+        const size_t ret = fread(state_mem, 1, state_size, fp_read);
+        if (ret != state_size) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
+            return 1;
+        }
+        llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
+        fclose(fp_read);
+    }
+    delete[] state_mem;
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto logits = llama_get_logits(ctx2);
+        auto n_vocab = llama_n_vocab(ctx2);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx2, &candidates_p);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    llama_free(ctx2);
+    llama_free_model(model);
+    return 0;
+}

examples/server/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+set(TARGET server)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} server.cpp json.hpp httplib.h)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/server/README.md ADDED Viewed

	@@ -0,0 +1,192 @@

+# llama.cpp/example/server
+This example demonstrates a simple HTTP API server to interact with llama.cpp.
+Command line options:
+-   `--threads N`, `-t N`: Set the number of threads to use during computation.
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+-   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
+-   `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+-   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+-   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
+-   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
+-   `--port`: Set the port to listen. Default: `8080`.
+-   `--embedding`: Enable embedding extraction, Default: disabled.
+## Build
+Build llama.cpp with server from repository root with either make or CMake.
+- Using `make`:
+  ```bash
+  LLAMA_BUILD_SERVER=1 make
+  ```
+- Using `CMake`:
+  ```bash
+  mkdir build-server
+  cd build-server
+  cmake -DLLAMA_BUILD_SERVER=ON ..
+  cmake --build . --config Release
+  ```
+## Quick Start
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+### Unix-based systems (Linux, macOS, etc.):
+```bash
+./server -m models/7B/ggml-model.bin -c 2048
+```
+### Windows:
+```powershell
+server.exe -m models\7B\ggml-model.bin -c 2048
+```
+The above command will start a server that by default listens on `127.0.0.1:8080`.
+You can consume the endpoints with Postman or NodeJS with axios library.
+## Testing with CURL
+Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
+```sh
+curl --request POST \
+    --url http://localhost:8080/completion \
+    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
+```
+## Node JS Test
+You need to have [Node.js](https://nodejs.org/en) installed.
+```bash
+mkdir llama-client
+cd llama-client
+npm init
+npm install axios
+```
+Create a index.js file and put inside this:
+```javascript
+const axios = require("axios");
+const prompt = `Building a website can be done in 10 simple steps:`;
+async function Test() {
+    let result = await axios.post("http://127.0.0.1:8080/completion", {
+        prompt,
+        n_predict: 512,
+    });
+    // the response is received until completion finish
+    console.log(result.data.content);
+}
+Test();
+```
+And run it:
+```bash
+node .
+```
+## API Endpoints
+-   **POST** `/completion`: Given a prompt, it returns the predicted completion.
+    *Options:*
+    `temperature`: Adjust the randomness of the generated text (default: 0.8).
+    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
+    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
+    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
+    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+    `stop`: Specify a JSON array of stopping strings.
+    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
+    `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+    `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+    `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
+    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+    `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
+    `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
+    `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
+    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+    `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+    `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+-   **POST** `/tokenize`: Tokenize a given text.
+    *Options:*
+    `content`: Set the text to tokenize.
+    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+-   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+    *Options:*
+    `content`: Set the text to process.
+## More examples
+### Interactive mode
+Check the sample in [chat.mjs](chat.mjs).
+Run with NodeJS version 16 or later:
+```sh
+node chat.mjs
+```
+Another sample in [chat.sh](chat.sh).
+Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/).
+Run with bash:
+```sh
+bash chat.sh
+```

examples/server/chat.mjs ADDED Viewed

	@@ -0,0 +1,89 @@

+import * as readline from 'node:readline'
+import { stdin, stdout } from 'node:process'
+const API_URL = 'http://127.0.0.1:8080'
+const chat = [
+    {
+        human: "Hello, Assistant.",
+        assistant: "Hello. How may I help you today?"
+    },
+    {
+        human: "Please tell me the largest city in Europe.",
+        assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia."
+    },
+]
+const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.`
+function format_prompt(question) {
+    return `${instruction}\n${
+        chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n")
+    }\n### Human: ${question}\n### Assistant:`
+}
+async function tokenize(content) {
+    const result = await fetch(`${API_URL}/tokenize`, {
+        method: 'POST',
+        body: JSON.stringify({ content })
+    })
+    if (!result.ok) {
+        return []
+    }
+    return await result.json().tokens
+}
+const n_keep = await tokenize(instruction).length
+async function chat_completion(question) {
+    const result = await fetch(`${API_URL}/completion`, {
+        method: 'POST',
+        body: JSON.stringify({
+            prompt: format_prompt(question),
+            temperature: 0.2,
+            top_k: 40,
+            top_p: 0.9,
+            n_keep: n_keep,
+            n_predict: 256,
+            stop: ["\n### Human:"], // stop completion after generating this
+            stream: true,
+        })
+    })
+    if (!result.ok) {
+        return
+    }
+    let answer = ''
+    for await (var chunk of result.body) {
+        const t = Buffer.from(chunk).toString('utf8')
+        if (t.startsWith('data: ')) {
+            const message = JSON.parse(t.substring(6))
+            answer += message.content
+            process.stdout.write(message.content)
+            if (message.stop) {
+                if (message.truncated) {
+                    chat.shift()
+                }
+                break
+            }
+        }
+    }
+    process.stdout.write('\n')
+    chat.push({ human: question, assistant: answer.trimStart() })
+}
+const rl = readline.createInterface({ input: stdin, output: stdout });
+const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => {
+    rl.question(query, options, resolve)
+});
+while(true) {
+    const question = await readlineQuestion(rl, '> ')
+    await chat_completion(question)
+}

examples/server/chat.sh ADDED Viewed

	@@ -0,0 +1,77 @@

+#!/bin/bash
+API_URL="${API_URL:-http://127.0.0.1:8080}"
+CHAT=(
+    "Hello, Assistant."
+    "Hello. How may I help you today?"
+    "Please tell me the largest city in Europe."
+    "Sure. The largest city in Europe is Moscow, the capital of Russia."
+)
+INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+trim() {
+    shopt -s extglob
+    set -- "${1##+([[:space:]])}"
+    printf "%s" "${1%%+([[:space:]])}"
+}
+trim_trailing() {
+    shopt -s extglob
+    printf "%s" "${1%%+([[:space:]])}"
+}
+format_prompt() {
+    echo -n "${INSTRUCTION}"
+    printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1"
+}
+tokenize() {
+    curl \
+        --silent \
+        --request POST \
+        --url "${API_URL}/tokenize" \
+        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
+    | jq '.tokens[]'
+}
+N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l)
+chat_completion() {
+    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
+    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
+        prompt: .,
+        temperature: 0.2,
+        top_k: 40,
+        top_p: 0.9,
+        n_keep: $n_keep,
+        n_predict: 256,
+        stop: ["\n### Human:"],
+        stream: true
+    }')"
+    ANSWER=''
+    while IFS= read -r LINE; do
+        if [[ $LINE = data:* ]]; then
+            CONTENT="$(echo "${LINE:5}" | jq -r '.content')"
+            printf "%s" "${CONTENT}"
+            ANSWER+="${CONTENT}"
+        fi
+    done < <(curl \
+        --silent \
+        --no-buffer \
+        --request POST \
+        --url "${API_URL}/completion" \
+        --data-raw "${DATA}")
+    printf "\n"
+    CHAT+=("$1" "$(trim "$ANSWER")")
+}
+while true; do
+    read -r -e -p "> " QUESTION
+    chat_completion "${QUESTION}"
+done

examples/server/httplib.h ADDED Viewed

The diff for this file is too large to render. See raw diff

examples/server/json.hpp ADDED Viewed

The diff for this file is too large to render. See raw diff

examples/server/server.cpp ADDED Viewed

	@@ -0,0 +1,975 @@

+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+// single thread
+#define CPPHTTPLIB_THREAD_POOL_COUNT 1
+#ifndef NDEBUG
+// crash the server in debug mode, otherwise send an http 500 error
+#define CPPHTTPLIB_NO_EXCEPTIONS 1
+#endif
+#include "httplib.h"
+#include "json.hpp"
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+using namespace httplib;
+using json = nlohmann::json;
+struct server_params {
+    std::string hostname = "127.0.0.1";
+    int32_t port = 8080;
+    int32_t read_timeout = 600;
+    int32_t write_timeout = 600;
+};
+static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+    return i;
+}
+enum stop_type {
+    STOP_FULL,
+    STOP_PARTIAL,
+};
+static bool ends_with(const std::string & str, const std::string & suffix) {
+    return str.size() >= suffix.size() &&
+        0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
+}
+static size_t find_partial_stop_string(const std::string & stop,
+                                       const std::string & text) {
+    if (!text.empty() && !stop.empty()) {
+        const char text_last_char = text.back();
+        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
+            if (stop[char_index] == text_last_char) {
+                const std::string current_partial = stop.substr(0, char_index + 1);
+                if (ends_with(text, current_partial)) {
+                    return text.size() - char_index - 1;
+                }
+            }
+        }
+    }
+    return std::string::npos;
+}
+template<class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string ret;
+    for (; begin != end; ++begin) {
+        ret += llama_token_to_str(ctx, *begin);
+    }
+    return ret;
+}
+static void server_log(const char * level, const char * function, int line,
+                       const char * message, const nlohmann::ordered_json & extra) {
+    nlohmann::ordered_json log {
+        { "timestamp", time(nullptr) },
+        { "level", level },
+        { "function", function },
+        { "line", line },
+        { "message", message },
+    };
+    if (!extra.empty()) {
+        log.merge_patch(extra);
+    }
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+static bool server_verbose = false;
+#if SERVER_VERBOSE != 1
+#  define LOG_VERBOSE(MSG, ...)
+#else
+#  define LOG_VERBOSE(MSG, ...)                                          \
+    do {                                                                 \
+        if (server_verbose) {                                            \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while(0)
+#endif
+#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+struct llama_server_context {
+    bool stream = false;
+    bool has_next_token = false;
+    std::string generated_text;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_remain = 0;
+    std::vector<llama_token> embd;
+    std::vector<llama_token> last_n_tokens;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    gpt_params params;
+    bool truncated = false;
+    bool stopped_eos = false;
+    bool stopped_word = false;
+    bool stopped_limit = false;
+    std::string stopping_word;
+    int32_t multibyte_pending = 0;
+    ~llama_server_context() {
+        if (ctx) {
+            llama_free(ctx);
+            ctx = nullptr;
+        }
+        if (model) {
+            llama_free_model(model);
+            model = nullptr;
+        }
+    }
+    void rewind() {
+        params.antiprompt.clear();
+        num_tokens_predicted = 0;
+        generated_text = "";
+        generated_text.reserve(params.n_ctx);
+        truncated = false;
+        stopped_eos = false;
+        stopped_word = false;
+        stopped_limit = false;
+        stopping_word = "";
+        multibyte_pending = 0;
+        n_remain = 0;
+        n_past = 0;
+    }
+    bool loadModel(const gpt_params & params_) {
+        params = params_;
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr) {
+            LOG_ERROR("unable to load model", { { "model", params_.model } });
+            return false;
+        }
+        last_n_tokens.resize(params.n_ctx);
+        std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+        return true;
+    }
+    void loadPrompt() {
+        params.prompt.insert(0, 1, ' '); // always add a first space
+        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        if (params.n_keep < 0) {
+            params.n_keep = (int)prompt_tokens.size();
+        }
+        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+        // if input prompt is too big, truncate like normal
+        if (prompt_tokens.size() >= (size_t)params.n_ctx) {
+            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+            const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left;
+            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
+            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            LOG_VERBOSE("input truncated", {
+                { "n_ctx", params.n_ctx },
+                { "n_keep", params.n_keep },
+                { "n_left", n_left },
+                { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
+            });
+            truncated = true;
+            prompt_tokens = new_tokens;
+        } else {
+            const size_t ps = prompt_tokens.size();
+            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
+            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
+        }
+        // compare the evaluated prompt with the new prompt
+        n_past = common_part(embd, prompt_tokens);
+        embd = prompt_tokens;
+        if (n_past == prompt_tokens.size()) {
+            // we have to evaluate at least 1 token to generate logits.
+            n_past--;
+        }
+        LOG_VERBOSE("prompt ingested", {
+            { "n_past", n_past },
+            { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) },
+            { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
+        });
+        has_next_token = true;
+    }
+    void beginCompletion() {
+        // number of tokens to keep when resetting context
+        n_remain = params.n_predict;
+        llama_set_rng_seed(ctx, params.seed);
+    }
+    llama_token nextToken() {
+        llama_token result = -1;
+        if (embd.size() >= (size_t)params.n_ctx) {
+            // Reset context
+            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            std::vector<llama_token> new_tokens(embd.begin(), embd.begin() + params.n_keep);
+            new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end());
+            embd = new_tokens;
+            n_past = params.n_keep;
+            truncated = true;
+            LOG_VERBOSE("input truncated", {
+                { "n_ctx", params.n_ctx },
+                { "n_keep", params.n_keep },
+                { "n_left", n_left },
+                { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) },
+            });
+        }
+        while (n_past < embd.size()) {
+            int n_eval = (int)embd.size() - n_past;
+            if (n_eval > params.n_batch) {
+                n_eval = params.n_batch;
+            }
+            if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) {
+                LOG_ERROR("failed to eval", {
+                    { "n_eval", n_eval },
+                    { "n_past", n_past },
+                    { "n_threads", params.n_threads },
+                    { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) },
+                });
+                has_next_token = false;
+                return result;
+            }
+            n_past += n_eval;
+        }
+        if (params.n_predict == 0) {
+            has_next_token = false;
+            return llama_token_eos();
+        }
+        // out of user input, sample next token
+        const float temp = params.temp;
+        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+        const float top_p = params.top_p;
+        const float tfs_z = params.tfs_z;
+        const float typical_p = params.typical_p;
+        const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+        const float repeat_penalty = params.repeat_penalty;
+        const float alpha_presence = params.presence_penalty;
+        const float alpha_frequency = params.frequency_penalty;
+        const int mirostat = params.mirostat;
+        const float mirostat_tau = params.mirostat_tau;
+        const float mirostat_eta = params.mirostat_eta;
+        const bool penalize_nl = params.penalize_nl;
+        llama_token id = 0;
+        {
+            auto * logits = llama_get_logits(ctx);
+            auto n_vocab = llama_n_vocab(ctx);
+            // Apply params.logit_bias map
+            for (const auto & it : params.logit_bias) {
+                logits[it.first] += it.second;
+            }
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+            }
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+            // Apply penalties
+            float nl_logit = logits[llama_token_nl()];
+            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+            llama_sample_repetition_penalty(ctx, &candidates_p,
+                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+                last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+            if (!penalize_nl) {
+                logits[llama_token_nl()] = nl_logit;
+            }
+            if (temp <= 0) {
+                // Greedy sampling
+                id = llama_sample_token_greedy(ctx, &candidates_p);
+            } else {
+                if (mirostat == 1) {
+                    static float mirostat_mu = 2.0f * mirostat_tau;
+                    const int mirostat_m = 100;
+                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+                } else if (mirostat == 2) {
+                    static float mirostat_mu = 2.0f * mirostat_tau;
+                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+                } else {
+                    // Temperature sampling
+                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                    llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                    llama_sample_temperature(ctx, &candidates_p, temp);
+                    id = llama_sample_token(ctx, &candidates_p);
+                }
+            }
+            last_n_tokens.erase(last_n_tokens.begin());
+            last_n_tokens.push_back(id);
+            num_tokens_predicted++;
+        }
+        // add it to the context
+        embd.push_back(id);
+        result = id;
+        // decrement remaining sampling budget
+        --n_remain;
+        if (!embd.empty() && embd.back() == llama_token_eos()) {
+            //stopping_word = llama_token_to_str(ctx, embd.back());
+            has_next_token = false;
+            stopped_eos = true;
+            LOG_VERBOSE("eos token found", {});
+            return result;
+        }
+        has_next_token = params.n_predict == -1 || n_remain != 0;
+        return result;
+    }
+    size_t findStoppingStrings(const std::string & text, const size_t last_token_size,
+                               const stop_type type) {
+        size_t stop_pos = std::string::npos;
+        for (const std::string & word : params.antiprompt) {
+            size_t pos;
+            if (type == STOP_FULL) {
+                const size_t tmp = word.size() + last_token_size;
+                const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+                pos = text.find(word, from_pos);
+            }
+            else {
+                pos = find_partial_stop_string(word, text);
+            }
+            if (pos != std::string::npos &&
+                (stop_pos == std::string::npos || pos < stop_pos)) {
+                if (type == STOP_FULL) {
+                    stopping_word = word;
+                    stopped_word = true;
+                    has_next_token = false;
+                }
+                stop_pos = pos;
+            }
+        }
+        return stop_pos;
+    }
+    std::string doCompletion() {
+        const llama_token token = nextToken();
+        const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
+        generated_text += token_text;
+        if (multibyte_pending > 0) {
+            multibyte_pending -= token_text.size();
+        } else if (token_text.size() == 1) {
+            const char c = token_text[0];
+            // 2-byte characters: 110xxxxx 10xxxxxx
+            if ((c & 0xE0) == 0xC0) {
+                multibyte_pending = 1;
+            // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+            } else if ((c & 0xF0) == 0xE0) {
+                multibyte_pending = 2;
+            // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            } else if ((c & 0xF8) == 0xF0) {
+                multibyte_pending = 3;
+            } else {
+                multibyte_pending = 0;
+            }
+        }
+        if (multibyte_pending > 0 && !has_next_token) {
+            has_next_token = true;
+            n_remain++;
+        }
+        if (!has_next_token && n_remain == 0) {
+            stopped_limit = true;
+        }
+        LOG_VERBOSE("next token", {
+            { "token", token },
+            { "token_text", llama_token_to_str(ctx, token) },
+            { "has_next_token", has_next_token },
+            { "n_remain", n_remain },
+            { "num_tokens_predicted", num_tokens_predicted },
+            { "stopped_eos", stopped_eos },
+            { "stopped_word", stopped_word },
+            { "stopped_limit", stopped_limit },
+            { "stopping_word", stopping_word },
+        });
+        return token_text;
+    }
+    std::vector<float> getEmbedding() {
+        static const int n_embd = llama_n_embd(ctx);
+        if (!params.embedding) {
+            LOG_WARNING("embedding disabled", {
+                { "params.embedding", params.embedding },
+            });
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        const float * data = llama_get_embeddings(ctx);
+        std::vector<float> embedding(data, data + n_embd);
+        return embedding;
+    }
+};
+static void server_print_usage(const char * argv0, const gpt_params & params,
+                               const server_params & sparams) {
+    fprintf(stderr, "usage: %s [options]\n", argv0);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    if (llama_mlock_supported()) {
+        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+    }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stderr, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+#endif
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
+    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stderr, "\n");
+}
+static void server_params_parse(int argc, char ** argv, server_params & sparams,
+                                gpt_params & params) {
+    gpt_params default_params;
+    server_params default_sparams;
+    std::string arg;
+    bool invalid_param = false;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "--port") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.port = std::stoi(argv[i]);
+        } else if (arg == "--host") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.hostname = argv[i];
+        } else if (arg == "--timeout" || arg == "-to") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.read_timeout = std::stoi(argv[i]);
+            sparams.write_timeout = std::stoi(argv[i]);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            server_print_usage(argv[0], default_params, default_sparams);
+            exit(0);
+        } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "--memory-f32" || arg == "--memory_f32") {
+            params.memory_f16 = false;
+        } else if (arg == "--threads" || arg == "-t") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+                        "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } });
+#endif
+        }
+        else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+            for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+                if (i_device < split_arg.size()) {
+                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
+                }
+                else {
+                    params.tensor_split[i_device] = 0.0f;
+                }
+            }
+#else
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+#endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--low-vram" || arg == "-lv")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+#endif
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
+        } else if (arg == "-v" || arg == "--verbose") {
+#if SERVER_VERBOSE != 1
+            LOG_WARNING("server.cpp is not built with verbose logging.", {});
+#else
+            server_verbose = true;
+#endif
+        } else if (arg == "--mlock") {
+            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
+        } else if (arg == "--embedding") {
+            params.embedding = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            server_print_usage(argv[0], default_params, default_sparams);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        server_print_usage(argv[0], default_params, default_sparams);
+        exit(1);
+    }
+}
+static json format_generation_settings(llama_server_context & llama) {
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+        eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+    return json {
+        { "seed", llama.params.seed },
+        { "temp", llama.params.temp },
+        { "top_k", llama.params.top_k },
+        { "top_p", llama.params.top_p },
+        { "tfs_z", llama.params.tfs_z },
+        { "typical_p", llama.params.typical_p },
+        { "repeat_last_n", llama.params.repeat_last_n },
+        { "repeat_penalty", llama.params.repeat_penalty },
+        { "presence_penalty", llama.params.presence_penalty },
+        { "frequency_penalty", llama.params.frequency_penalty },
+        { "mirostat", llama.params.mirostat },
+        { "mirostat_tau", llama.params.mirostat_tau },
+        { "mirostat_eta", llama.params.mirostat_eta },
+        { "penalize_nl", llama.params.penalize_nl },
+        { "stop", llama.params.antiprompt },
+        { "n_predict", llama.params.n_predict },
+        { "n_keep", llama.params.n_keep },
+        { "ignore_eos", ignore_eos },
+        { "stream", llama.stream },
+        { "logit_bias", llama.params.logit_bias },
+    };
+}
+static json format_embedding_response(llama_server_context & llama) {
+    return json {
+        { "embedding", llama.getEmbedding() },
+    };
+}
+static json format_final_response(llama_server_context & llama, const std::string & content) {
+    return json {
+        { "content", content },
+        { "stop", true },
+        { "model", llama.params.model_alias },
+        { "tokens_predicted", llama.num_tokens_predicted },
+        { "generation_settings", format_generation_settings(llama) },
+        { "prompt", llama.params.prompt },
+        { "truncated", llama.truncated },
+        { "stopped_eos", llama.stopped_eos },
+        { "stopped_word", llama.stopped_word },
+        { "stopped_limit", llama.stopped_limit },
+        { "stopping_word", llama.stopping_word },
+    };
+}
+static json format_partial_response(const std::string & content) {
+    return json {
+        { "content", content },
+        { "stop", false },
+    };
+}
+static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+    return json {
+        { "tokens", tokens }
+    };
+}
+static void parse_options_completion(const json & body, llama_server_context & llama) {
+    gpt_params default_params;
+    llama.stream = body.value("stream", false);
+    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
+    llama.params.top_k = body.value("top_k", default_params.top_k);
+    llama.params.top_p = body.value("top_p", default_params.top_p);
+    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
+    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = body.value("temperature", default_params.temp);
+    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
+    llama.params.seed = body.value("seed", default_params.seed);
+    llama.params.prompt = body.value("prompt", default_params.prompt);
+    llama.params.logit_bias.clear();
+    if (body.value("ignore_eos", false)) {
+        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+    }
+    const auto & logit_bias = body.find("logit_bias");
+    if (logit_bias != body.end() && logit_bias->is_array()) {
+        const int n_vocab = llama_n_vocab(llama.ctx);
+        for (const auto & el : *logit_bias) {
+            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
+                llama_token tok = el[0].get<llama_token>();
+                if (tok >= 0 && tok < n_vocab) {
+                    if (el[1].is_number()) {
+                        llama.params.logit_bias[tok] = el[1].get<float>();
+                    } else if (el[1].is_boolean() && !el[1].get<bool>()) {
+                        llama.params.logit_bias[tok] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+    llama.params.antiprompt.clear();
+    const auto & stop = body.find("stop");
+    if (stop != body.end() && stop->is_array()) {
+        for (const auto & word : *stop) {
+            if (!word.empty()) {
+                llama.params.antiprompt.push_back(word);
+            }
+        }
+    }
+    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
+}
+static void log_server_request(const Request & req, const Response & res) {
+    LOG_INFO("request", {
+        { "remote_addr", req.remote_addr },
+        { "remote_port", req.remote_port },
+        { "status", res.status },
+        { "path", req.path },
+        { "request", req.body },
+        { "response", res.body },
+    });
+}
+int main(int argc, char ** argv) {
+    // own arguments required by this example
+    gpt_params params;
+    server_params sparams;
+    // struct that contains llama context and inference
+    llama_server_context llama;
+    server_params_parse(argc, argv, sparams, params);
+    if (params.model_alias == "unknown") {
+        params.model_alias = params.model;
+    }
+    llama_init_backend();
+    LOG_INFO("build info", {
+        { "build", BUILD_NUMBER },
+        { "commit", BUILD_COMMIT }
+    });
+    LOG_INFO("system info", {
+        { "n_threads", params.n_threads },
+        { "total_threads", std::thread::hardware_concurrency() },
+        { "system_info", llama_print_system_info() },
+    });
+    // load the model
+    if (!llama.loadModel(params)) {
+        return 1;
+    }
+    Server svr;
+    svr.set_default_headers({
+        { "Access-Control-Allow-Origin", "*" },
+        { "Access-Control-Allow-Headers", "content-type" }
+    });
+    svr.Get("/", [](const Request &, Response & res) {
+        res.set_content("<h1>llama.cpp server works</h1>", "text/html");
+    });
+    svr.Post("/completion", [&llama](const Request & req, Response & res) {
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        parse_options_completion(json::parse(req.body), llama);
+        llama.loadPrompt();
+        llama.beginCompletion();
+        if (!llama.stream) {
+            size_t stop_pos = std::string::npos;
+            while (llama.has_next_token) {
+                const std::string token_text = llama.doCompletion();
+                stop_pos = llama.findStoppingStrings(llama.generated_text,
+                    token_text.size(), STOP_FULL);
+            }
+            if (stop_pos == std::string::npos) {
+                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+            }
+            if (stop_pos != std::string::npos) {
+                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                    llama.generated_text.end());
+            }
+            const json data = format_final_response(llama, llama.generated_text);
+            llama_print_timings(llama.ctx);
+            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+        } else {
+            const auto chunked_content_provider = [&](size_t, DataSink & sink) {
+                size_t sent_count = 0;
+                while (llama.has_next_token) {
+                    const std::string token_text = llama.doCompletion();
+                    if (llama.multibyte_pending > 0) {
+                        continue;
+                    }
+                    size_t pos = std::min(sent_count, llama.generated_text.size());
+                    const std::string str_test = llama.generated_text.substr(pos);
+                    size_t stop_pos =
+                        llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
+                    if (stop_pos != std::string::npos) {
+                        llama.generated_text.erase(
+                            llama.generated_text.begin() + pos + stop_pos,
+                            llama.generated_text.end());
+                        pos = std::min(sent_count, llama.generated_text.size());
+                    } else {
+                        stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
+                            STOP_PARTIAL);
+                    }
+                    const std::string to_send = llama.generated_text.substr(pos, stop_pos);
+                    sent_count += to_send.size();
+                    const json data = llama.has_next_token
+                                          ? format_partial_response(to_send)
+                                          // Generation is done, send extra information.
+                                          : format_final_response(llama, to_send);
+                    const std::string str =
+                        "data: " +
+                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
+                    LOG_VERBOSE("data stream", {
+                        { "to_send", str }
+                    });
+                    if (!sink.write(str.data(), str.size())) {
+                        LOG_VERBOSE("stream closed", {});
+                        llama_print_timings(llama.ctx);
+                        return false;
+                    }
+                }
+                llama_print_timings(llama.ctx);
+                sink.done();
+                return true;
+            };
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+        }
+    });
+    svr.Options(R"(/.*)", [](const Request &, Response & res) {
+        return res.set_content("", "application/json");
+    });
+    svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
+        const json body = json::parse(req.body);
+        const std::string content = body.value("content", "");
+        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        const json data = format_tokenizer_response(tokens);
+        return res.set_content(data.dump(), "application/json");
+    });
+    svr.Post("/embedding", [&llama](const Request & req, Response & res) {
+        const json body = json::parse(req.body);
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        llama.params.prompt = body.value("content", "");
+        llama.params.n_predict = 0;
+        llama.loadPrompt();
+        llama.beginCompletion();
+        llama.doCompletion();
+        const json data = format_embedding_response(llama);
+        return res.set_content(data.dump(), "application/json");
+    });
+    svr.set_logger(log_server_request);
+    svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
+        const auto * fmt = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(std::move(ep));
+        } catch (std::exception & e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+    // set timeouts and change hostname and port
+    svr.set_read_timeout(sparams.read_timeout);
+    svr.set_write_timeout(sparams.write_timeout);
+    if (!svr.bind_to_port(sparams.hostname, sparams.port)) {
+        LOG_ERROR("couldn't bind to server socket", {
+            { "hostname", sparams.hostname },
+            { "port", sparams.port },
+        });
+        return 1;
+    }
+    LOG_INFO("HTTP server listening", {
+        { "hostname", sparams.hostname },
+        { "port", sparams.port },
+    });
+    if (!svr.listen_after_bind()) {
+        return 1;
+    }
+    return 0;
+}

examples/simple/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+set(TARGET simple)
+add_executable(${TARGET} simple.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()

examples/simple/simple.cpp ADDED Viewed

	@@ -0,0 +1,179 @@

+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <signal.h>
+#endif
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+    //---------------------------------
+    // Print help :
+    //---------------------------------
+    if ( argc == 1 || argv[1][0] == '-' )
+    {
+        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+        return 1 ;
+    }
+    //---------------------------------
+    // Load parameters :
+    //---------------------------------
+    if ( argc >= 2 )
+    {
+        params.model = argv[1];
+    }
+    if ( argc >= 3 )
+    {
+        params.prompt = argv[2];
+    }
+    if ( params.prompt.empty() )
+    {
+        params.prompt = "Hello my name is";
+    }
+    //---------------------------------
+    // Init LLM :
+    //---------------------------------
+    llama_init_backend();
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    if ( model == NULL )
+    {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }
+    //---------------------------------
+    // Tokenize the prompt :
+    //---------------------------------
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    const int max_context_size     = llama_n_ctx( ctx );
+    const int max_tokens_list_size = max_context_size - 4 ;
+    if ( (int)tokens_list.size() > max_tokens_list_size )
+    {
+        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
+             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+        return 1;
+    }
+    fprintf( stderr, "\n\n" );
+    // Print the tokens from the prompt :
+    for( auto id : tokens_list )
+    {
+        printf( "%s" , llama_token_to_str( ctx , id ) );
+    }
+    fflush(stdout);
+    //---------------------------------
+    // Main prediction loop :
+    //---------------------------------
+    // The LLM keeps a contextual cache memory of previous token evaluation.
+    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
+    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
+    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
+    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
+    {
+        //---------------------------------
+        // Evaluate the tokens :
+        //---------------------------------
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
+        {
+            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+            return 1;
+        }
+        tokens_list.clear();
+        //---------------------------------
+        // Select the best prediction :
+        //---------------------------------
+        llama_token new_token_id = 0;
+        auto logits  = llama_get_logits( ctx );
+        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        std::vector<llama_token_data> candidates;
+        candidates.reserve( n_vocab );
+        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
+        {
+            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        // Select it using the "Greedy sampling" method :
+        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+        // is it an end of stream ?
+        if ( new_token_id == llama_token_eos() )
+        {
+            fprintf(stderr, " [end of text]\n");
+            break;
+        }
+        // Print the new token :
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        fflush( stdout );
+        // Push this new token for next evaluation :
+        tokens_list.push_back( new_token_id );
+    } // wend of main loop
+    llama_free( ctx );
+    llama_free_model( model );
+    return 0;
+}
+// EOF

examples/train-text-from-scratch/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+set(TARGET train-text-from-scratch)
+add_executable(${TARGET} train-text-from-scratch.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)