Upload folder using huggingface_hub
- .env +19 -0
- .gitattributes +1 -0
- .github/workflows/branch.yml +60 -0
- .github/workflows/release.yml +30 -0
- .gitignore +10 -0
- CONTRIBUTING.md +90 -0
- LICENSE +21 -0
- README.md +379 -8
- app.py +433 -0
- benchmark.py +145 -0
- code_completion.py +216 -0
- colab/Llama_2_7b_Chat_GPTQ.ipynb +0 -0
- colab/ggmlv3_q4_0.ipynb +109 -0
- colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb +514 -0
- docs/issues.md +0 -0
- docs/news.md +38 -0
- docs/performance.md +32 -0
- docs/pypi.md +187 -0
- env_examples/.env.13b_example +13 -0
- env_examples/.env.7b_8bit_example +13 -0
- env_examples/.env.7b_ggmlv3_q4_0_example +18 -0
- env_examples/.env.7b_gptq_example +18 -0
- llama2_cu_python/Makefile +9 -0
- llama2_cu_python/__init__.py +3 -0
- llama2_cu_python/libllama2.so +3 -0
- llama2_cu_python/llama2.cu +1394 -0
- llama2_cu_python/llama2.h +23 -0
- llama2_cu_python/llama2_cu.py +151 -0
- llama2_wrapper/__init__.py +1 -0
- llama2_wrapper/download/__init__.py +0 -0
- llama2_wrapper/download/__main__.py +59 -0
- llama2_wrapper/model.py +839 -0
- llama2_wrapper/server/__init__.py +0 -0
- llama2_wrapper/server/__main__.py +46 -0
- llama2_wrapper/server/app.py +526 -0
- llama2_wrapper/types.py +115 -0
- poetry.lock +0 -0
- prompts/prompts_en.csv +0 -0
- prompts/utils.py +48 -0
- pyproject.toml +47 -0
- requirements.txt +21 -0
- static/screenshot.png +0 -0
- tests/__init__.py +0 -0
- tests/test_get_prompt.py +59 -0
.env
ADDED
@@ -0,0 +1,19 @@
MODEL_PATH = ""
# if MODEL_PATH is "", default llama.cpp/gptq models
# will be downloaded to: ./models

# Example ggml path:
MODEL_PATH = "models/llama2_7b_chat.bin"

# options: llama.cpp, gptq, transformers
#BACKEND_TYPE = "llama.cpp"
BACKEND_TYPE = "llama2.cu"

# only for transformers bitsandbytes 8 bit
LOAD_IN_8BIT = False

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4000

DEFAULT_SYSTEM_PROMPT = ""
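For reference, these settings are read at startup through python-dotenv, as `app.py` later in this diff does. A minimal sketch (variable names come from the file above; everything else is illustrative):

```python
# Minimal sketch of how the .env above is consumed (mirrors app.py below).
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into os.environ

MODEL_PATH = os.getenv("MODEL_PATH")        # e.g. "models/llama2_7b_chat.bin"
BACKEND_TYPE = os.getenv("BACKEND_TYPE")    # e.g. "llama2.cu"
MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
print(MODEL_PATH, BACKEND_TYPE, MAX_MAX_NEW_TOKENS)
```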
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+llama2_cu_python/libllama2.so filter=lfs diff=lfs merge=lfs -text
.github/workflows/branch.yml
ADDED
@@ -0,0 +1,60 @@
name: Push
on: [push]

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.10']
        poetry-version: ['1.5.1']
        os: [ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Run image
        uses: abatilo/actions-poetry@v2.1.4
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Install dependencies
        run: poetry install
      - name: Run tests
        run: poetry run pytest
      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
      # - name: Upload coverage to Codecov
      #   uses: codecov/codecov-action@v2
  code-quality:
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.10']
        poetry-version: ['1.5.1']
        os: [ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Python Poetry Action
        uses: abatilo/actions-poetry@v2.1.6
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Install dependencies
        run: poetry install
      - name: Run black
        run: poetry run black . --check
      # - name: Run isort
      #   run: poetry run isort . --check-only --profile black
      # - name: Run flake8
      #   run: poetry run flake8 .
      # - name: Run bandit
      #   run: poetry run bandit .
      # - name: Run safety
      #   run: poetry run safety check
.github/workflows/release.yml
ADDED
@@ -0,0 +1,30 @@
name: Release
on:
  release:
    types:
      - created

jobs:
  publish:
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.10']
        poetry-version: ['1.5.1']
        os: [ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Run image
        uses: abatilo/actions-poetry@v2.1.4
        with:
          poetry-version: ${{ matrix.poetry-version }}
      - name: Publish
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi $PYPI_TOKEN
          poetry publish --build
.gitignore
ADDED
@@ -0,0 +1,10 @@
models
dist

.DS_Store
.vscode

__pycache__
gradio_cached_examples

.pytest_cache
CONTRIBUTING.md
ADDED
@@ -0,0 +1,90 @@
# Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui)

We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:

- Reporting a bug
- Proposing new features
- Discussing the current state of the code
- Updating README.md
- Submitting a PR

## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues)

We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy!

Thanks to **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)!

**Great bug reports** tend to have:

- A quick summary and/or background
- Steps to reproduce
  - Be specific!
  - Give sample code if you can.
- What you expected would happen
- What actually happens
- Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)

Proposals for new features are also welcome.

## Pull Request

All pull requests are welcome. For example, you could update `README.md` to help users better understand the usage.

### Clone the repository

1. Create a user account on GitHub if you do not already have one.

2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub.

3. Clone this copy to your local disk:

   ```
   git clone git@github.com:liltom-eth/llama2-webui.git
   cd llama2-webui
   ```

### Implement your changes

1. Create a branch to hold your changes:

   ```
   git checkout -b my-feature
   ```

   and start making changes. Never work on the main branch!

2. Start your work on this branch.

3. When you're done editing, do:

   ```
   git add <MODIFIED FILES>
   git commit
   ```

   to record your changes in [git](https://git-scm.com/).

### Submit your contribution

1. If everything works fine, push your local branch to the remote server with:

   ```
   git push -u origin my-feature
   ```

2. Go to the web page of your fork and click "Create pull request" to send your changes for review.

```{todo}
Find more detailed information in [creating a PR]. You might also want to open
the PR as a draft first and mark it as ready for review after the feedback
from the continuous integration (CI) system or any required fixes.
```

## License

By contributing, you agree that your contributions will be licensed under the MIT License.

## Questions?

Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com)
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Tom

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,383 @@
Removed front matter:

-title:
-emoji: 😻
-colorFrom: pink
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.11.0

New file content:

---
title: llama2-webui
app_file: app.py
sdk: gradio
sdk_version: 3.37.0
---
# llama2-webui

Running Llama 2 with a Gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac).
- Supports all Llama 2 models (7B, 13B, 70B, GPTQ, GGML, GGUF, [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)) in 8-bit and 4-bit modes.
- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for generative agents/apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
- [Run an OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.

![screenshot](./static/screenshot.png)

![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)

## Features

- Supported models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [Llama-2-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
- Supported model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes (8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ (4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on free Colab T4 GPU](./colab/Llama_2_7b_Chat_GPTQ.ipynb)
- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for generative agents/apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
- [Run an OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.
- [News](./docs/news.md), [Benchmark](./docs/performance.md), [Issue Solutions](./docs/issues.md)

## Contents

- [Install](#install)
- [Usage](#usage)
  - [Start Chat UI](#start-chat-ui)
  - [Start Code Llama UI](#start-code-llama-ui)
  - [Use llama2-wrapper for Your App](#use-llama2-wrapper-for-your-app)
  - [Start OpenAI Compatible API](#start-openai-compatible-api)
- [Benchmark](#benchmark)
- [Download Llama-2 Models](#download-llama-2-models)
  - [Model List](#model-list)
  - [Download Script](#download-script)
- [Tips](#tips)
  - [Env Examples](#env-examples)
  - [Run on Nvidia GPU](#run-on-nvidia-gpu)
    - [Run bitsandbytes 8 bit](#run-bitsandbytes-8-bit)
    - [Run GPTQ 4 bit](#run-gptq-4-bit)
  - [Run on CPU](#run-on-cpu)
    - [Mac Metal Acceleration](#mac-metal-acceleration)
    - [AMD/Nvidia GPU Acceleration](#amdnvidia-gpu-acceleration)
- [License](#license)
- [Contributing](#contributing)

## Install
### Method 1: From [PyPI](https://pypi.org/project/llama2-wrapper/)
```
pip install llama2-wrapper
```
The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.

If you would like to use old `ggml` models, install `llama2-wrapper<=0.1.13` or manually install `llama-cpp-python==0.1.77`.

### Method 2: From Source

```
git clone https://github.com/liltom-eth/llama2-webui.git
cd llama2-webui
pip install -r requirements.txt
```
### Install Issues
`bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this:

- `pip install bitsandbytes==0.38.1`

`bitsandbytes` also needs a special install on Windows:

```
pip uninstall bitsandbytes
pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl
```

## Usage

### Start Chat UI

Run the chatbot with the web UI:

```bash
python app.py
```

`app.py` will load the default config `.env`, which uses `llama.cpp` as the backend to run the `llama-2-7b-chat.ggmlv3.q4_0.bin` model for inference. The model `llama-2-7b-chat.ggmlv3.q4_0.bin` will be downloaded automatically.

```bash
Running on backend llama.cpp.
Use default model path: ./models/llama-2-7b-chat.Q4_0.gguf
Start downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf
```

You can also customize your `MODEL_PATH`, `BACKEND_TYPE`, and model configs in the `.env` file to run different llama2 models on different backends (llama.cpp, transformers, gptq).

### Start Code Llama UI

We provide a code completion / filling UI for Code Llama.

The base model **Code Llama** and the extended model **Code Llama - Python** are not fine-tuned to follow instructions. They should be prompted so that the expected answer is the natural continuation of the prompt. That means these two models focus on code filling and code completion.

Here is an example running CodeLlama code completion on the llama.cpp backend:

```
python code_completion.py --model_path ./models/codellama-7b.Q4_0.gguf
```

![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)

`codellama-7b.Q4_0.gguf` can be downloaded from [TheBloke/CodeLlama-7B-GGUF](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/blob/main/codellama-7b.Q4_0.gguf).

**Code Llama - Instruct** is trained on "natural language instruction" inputs paired with expected outputs, which makes it better at understanding what a prompt is asking for. That means instruct models can be used in a chatbot-like app.

Example running CodeLlama chat on the gptq backend:

```
python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True
```

![code_llama_chat](https://i.imgur.com/lQLfemB.gif)

`CodeLlama-7B-Instruct-GPTQ` can be downloaded from [TheBloke/CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ).

### Use llama2-wrapper for Your App

🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper on [PyPI](https://pypi.org/project/llama2-wrapper/).

Use `llama2-wrapper` as your local llama2 backend to answer questions and more, [colab example](./colab/ggmlv3_q4_0.ipynb):

```python
# pip install llama2-wrapper
from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
llama2_wrapper = LLAMA2_WRAPPER()
# Default running on backend llama.cpp.
# Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin
prompt = "Do you know Pytorch"
answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
```

Run a gptq llama2 model on an Nvidia GPU, [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb):

```python
from llama2_wrapper import LLAMA2_WRAPPER
llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
# Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
```

Run llama2 7b with bitsandbytes 8-bit with a `model_path`:

```python
from llama2_wrapper import LLAMA2_WRAPPER
llama2_wrapper = LLAMA2_WRAPPER(
    model_path = "./models/Llama-2-7b-chat-hf",
    backend_type = "transformers",
    load_in_8bit = True
)
```
Check the [API document](https://pypi.org/project/llama2-wrapper/) for more usages.

### Start OpenAI Compatible API

`llama2-wrapper` offers a web server that acts as a drop-in replacement for the OpenAI API. This allows you to use Llama2 models with any OpenAI-compatible clients, libraries, or services.

Start the FastAPI server:

```
python -m llama2_wrapper.server
```

By default it will use `llama.cpp` as the backend and run the `llama-2-7b-chat.ggmlv3.q4_0.bin` model.

Start the FastAPI server with the `gptq` backend:

```
python -m llama2_wrapper.server --backend_type gptq
```

Navigate to http://localhost:8000/docs to see the OpenAPI documentation.

#### Basic settings

| Flag             | Description                                                  |
| ---------------- | ------------------------------------------------------------ |
| `-h`, `--help`   | Show this help message.                                      |
| `--model_path`   | The path to the model to use for generating completions.    |
| `--backend_type` | Backend for llama2: llama.cpp, gptq, or transformers.       |
| `--max_tokens`   | Maximum context size.                                        |
| `--load_in_8bit` | Whether to use bitsandbytes to run the model in 8-bit mode (transformers models only). |
| `--verbose`      | Whether to print verbose output to stderr.                   |
| `--host`         | API address                                                  |
| `--port`         | API port                                                     |

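Since the server presents an OpenAI-compatible interface, a plain HTTP client call along the lines of the sketch below should work once `python -m llama2_wrapper.server` is running. The `/v1/chat/completions` route, the default `http://localhost:8000` address, and the locally ignored `model` field are assumptions based on typical OpenAI-compatible servers rather than confirmed details of this server.

```python
# Minimal sketch: query the local OpenAI-compatible server over plain HTTP.
# Assumes the server from `python -m llama2_wrapper.server` listens on
# localhost:8000 and exposes the standard /v1/chat/completions route.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "llama-2-7b-chat",  # placeholder; local servers often ignore it
        "messages": [{"role": "user", "content": "Do you know Pytorch?"}],
        "max_tokens": 256,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```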
## Benchmark

Run the benchmark script to measure performance on your device; `benchmark.py` loads the same `.env` as `app.py`:

```bash
python benchmark.py
```

You can also set the `iter`, `backend_type`, and `model_path` arguments for the benchmark run (these override the `.env` values):

```bash
python benchmark.py --iter NB_OF_ITERATIONS --backend_type gptq
```

By default, the number of iterations is 5. You can set it to any value for a faster or more accurate result, but please only report results obtained with at least 5 iterations.

This [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb) also shows how to benchmark a gptq model on a free Google Colab T4 GPU.

Some benchmark results:

| Model                       | Precision | Device             | RAM / GPU VRAM | Speed (tokens/sec) | Load time (s) |
| --------------------------- | --------- | ------------------ | -------------- | ------------------ | ------------- |
| Llama-2-7b-chat-hf          | 8 bit     | NVIDIA RTX 2080 Ti | 7.7 GB VRAM    | 3.76               | 641.36        |
| Llama-2-7b-Chat-GPTQ        | 4 bit     | NVIDIA RTX 2080 Ti | 5.8 GB VRAM    | 18.85              | 192.91        |
| Llama-2-7b-Chat-GPTQ        | 4 bit     | Google Colab T4    | 5.8 GB VRAM    | 18.19              | 37.44         |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M1 Pro CPU   | 5.4 GB RAM     | 17.90              | 0.18          |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M2 CPU       | 5.4 GB RAM     | 13.70              | 0.13          |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M2 Metal     | 5.4 GB RAM     | 12.60              | 0.10          |
| llama-2-7b-chat.ggmlv3.q2_K | 2 bit     | Intel i7-8700      | 4.5 GB RAM     | 7.88               | 31.90         |

Check/contribute the performance of your device in the full [performance doc](./docs/performance.md).

## Download Llama-2 Models

Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.

Llama-2-7b-Chat-GPTQ contains the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 models require less GPU VRAM to run.

### Model List

| Model Name                          | Set MODEL_PATH in .env                   | Download URL                                                 |
| ----------------------------------- | ---------------------------------------- | ------------------------------------------------------------ |
| meta-llama/Llama-2-7b-chat-hf       | /path-to/Llama-2-7b-chat-hf              | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf)   |
| meta-llama/Llama-2-13b-chat-hf      | /path-to/Llama-2-13b-chat-hf             | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)  |
| meta-llama/Llama-2-70b-chat-hf      | /path-to/Llama-2-70b-chat-hf             | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf)  |
| meta-llama/Llama-2-7b-hf            | /path-to/Llama-2-7b-hf                   | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf)      |
| meta-llama/Llama-2-13b-hf           | /path-to/Llama-2-13b-hf                  | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf)     |
| meta-llama/Llama-2-70b-hf           | /path-to/Llama-2-70b-hf                  | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf)     |
| TheBloke/Llama-2-7b-Chat-GPTQ       | /path-to/Llama-2-7b-Chat-GPTQ            | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) |
| TheBloke/Llama-2-7b-Chat-GGUF       | /path-to/llama-2-7b-chat.Q4_0.gguf       | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf) |
| TheBloke/Llama-2-7B-Chat-GGML       | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) |
| TheBloke/CodeLlama-7B-Instruct-GPTQ | TheBloke/CodeLlama-7B-Instruct-GPTQ      | [Link](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) |
| ...                                 | ...                                      | ...                                                          |

Running the 4-bit model `Llama-2-7b-Chat-GPTQ` needs a GPU with 6 GB of VRAM.

Running the 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs a CPU with 6 GB of RAM. Other 2-, 3-, 4-, 5-, 6-, and 8-bit GGML models are available from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML).

### Download Script

These models can be downloaded with:

```bash
python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Python-GPTQ

python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7b-Chat-GGUF --filename llama-2-7b-chat.Q4_0.gguf --save_dir ./models
```

Or use commands like:

```bash
# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf
```

To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests are usually processed within hours.

For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can download directly without requesting access.

For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can download directly without requesting access.

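If you prefer not to use the download module or git-lfs, the same single-file downloads can be done directly with `huggingface_hub` (the library this Space's commit itself was made with). A minimal sketch, reusing the repo id and filename from the command above:

```python
# Minimal sketch: fetch one GGUF file with huggingface_hub instead of
# the llama2_wrapper.download module or git-lfs.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_0.gguf",
    local_dir="./models",  # keep the same ./models layout the web UI expects
)
print(local_path)
```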
## Tips

### Env Examples

There are some examples in the `./env_examples/` folder.

| Model Setup                                             | Example .env                |
| ------------------------------------------------------- | --------------------------- |
| Llama-2-7b-chat-hf 8-bit (transformers backend)         | .env.7b_8bit_example        |
| Llama-2-7b-Chat-GPTQ 4-bit (gptq transformers backend)  | .env.7b_gptq_example        |
| Llama-2-7B-Chat-GGML 4-bit (llama.cpp backend)          | .env.7b_ggmlv3_q4_0_example |
| Llama-2-13b-chat-hf (transformers backend)              | .env.13b_example            |
| ...                                                     | ...                         |

### Run on Nvidia GPU

Running on GPU requires around 14 GB of GPU VRAM for Llama-2-7b and 28 GB for Llama-2-13b.

If you are running on multiple GPUs, the model is loaded across them automatically, splitting the VRAM usage. That allows you to run Llama-2-7b (which requires 14 GB of GPU VRAM) on a setup like 2 GPUs with 11 GB of VRAM each.

#### Run bitsandbytes 8 bit

If you do not have enough memory, you can set `LOAD_IN_8BIT` to `True` in `.env`. This reduces memory usage by around half, with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backends.

Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, or V100 (16GB).

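For reference, the multi-GPU splitting and 8-bit loading described above are the standard Hugging Face mechanisms (accelerate's device map plus bitsandbytes). A minimal sketch of loading the chat model that way with plain transformers follows; the wrapper's own transformers backend may use different arguments, so treat this as illustrative only:

```python
# Minimal sketch: load Llama-2-7b-chat with transformers, sharding layers
# across visible GPUs and compressing weights to 8 bit via bitsandbytes.
# Requires the accelerate and bitsandbytes packages to be installed.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./models/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",   # spread layers over all visible GPUs
    load_in_8bit=True,   # roughly halves VRAM usage
)
```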
#### Run GPTQ 4 bit

If you want to run a 4-bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, set `BACKEND_TYPE` to `gptq` in `.env`, as in the example `.env.7b_gptq_example`.

Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in the `.env` file.

`Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM.

If you encounter an issue like `NameError: name 'autogptq_cuda_256' is not defined`, please refer to [this discussion](https://huggingface.co/TheBloke/open-llama-13b-open-instruct-GPTQ/discussions/1):
> pip install https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl

### Run on CPU

Running a Llama-2 model on CPU requires the [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and the [llama.cpp Python bindings](https://github.com/abetlen/llama-cpp-python), which are already installed.

Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following the [Download Llama-2 Models](#download-llama-2-models) section. The `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB of RAM to run on CPU.

Set up a config like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as your `.env`.

Run the web UI with `python app.py`.

#### Mac Metal Acceleration

Mac users can also set up Metal acceleration by installing these dependencies:

```bash
pip uninstall llama-cpp-python -y
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
pip install 'llama-cpp-python[server]'
```

or check the details:

- [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md)

#### AMD/Nvidia GPU Acceleration

If you would like to use an AMD/Nvidia GPU for acceleration, check this:

- [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)

## License

MIT - see [MIT License](LICENSE)

This project enables users to adapt it freely for proprietary purposes without any restrictions.

## Contributing

Kindly read our [Contributing Guide](CONTRIBUTING.md) to learn and understand our development process.

### All Contributors

<a href="https://github.com/liltom-eth/llama2-webui/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=liltom-eth/llama2-webui" />
</a>

### Review
<a href='https://github.com/repo-reviews/repo-reviews.github.io/blob/main/create.md' target="_blank"><img alt='Github' src='https://img.shields.io/badge/review-100000?style=flat&logo=Github&logoColor=white&labelColor=888888&color=555555'/></a>

### Star History

[![Star History Chart](https://api.star-history.com/svg?repos=liltom-eth/llama2-webui&type=Date)](https://star-history.com/#liltom-eth/llama2-webui&Date)

## Credits

- https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
- https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat
- https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ
- [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
- [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
- [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)
- [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
app.py
ADDED
@@ -0,0 +1,433 @@
import os
import time
import argparse
from typing import Iterator

import gradio as gr
from dotenv import load_dotenv
from distutils.util import strtobool

from llama2_wrapper import LLAMA2_WRAPPER

import logging

from prompts.utils import PromtsContainer


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="", help="model path")
    parser.add_argument(
        "--backend_type",
        type=str,
        default="",
        help="Backend options: llama.cpp, gptq, transformers, llama2.cu",
    )
    parser.add_argument(
        "--load_in_8bit",
        type=bool,
        default=False,
        help="Whether to use bitsandbytes 8 bit.",
    )
    parser.add_argument(
        "--share",
        type=bool,
        default=False,
        help="Whether to share public for gradio.",
    )
    args = parser.parse_args()

    load_dotenv()

    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))

    MODEL_PATH = os.getenv("MODEL_PATH")
    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
    BACKEND_TYPE = os.getenv("BACKEND_TYPE")
    assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"

    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))

    # CLI arguments override the .env values.
    if args.model_path != "":
        MODEL_PATH = args.model_path
    if args.backend_type != "":
        BACKEND_TYPE = args.backend_type
    if args.load_in_8bit:
        LOAD_IN_8BIT = True

    llama2_wrapper = LLAMA2_WRAPPER(
        model_path=MODEL_PATH,
        backend_type=BACKEND_TYPE,
        max_tokens=MAX_INPUT_TOKEN_LENGTH,
        load_in_8bit=LOAD_IN_8BIT,
        verbose=True,
    )

    DESCRIPTION = """
# llama2-webui
"""
    DESCRIPTION2 = """
- Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
- Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
"""

    def clear_and_save_textbox(message: str) -> tuple[str, str]:
        return "", message

    def save_textbox_for_prompt(message: str) -> str:
        logging.info("start save_textbox_from_prompt")
        message = convert_summary_to_prompt(message)
        return message

    def display_input(
        message: str, history: list[tuple[str, str]]
    ) -> list[tuple[str, str]]:
        history.append((message, ""))
        return history

    def delete_prev_fn(
        history: list[tuple[str, str]]
    ) -> tuple[list[tuple[str, str]], str]:
        try:
            message, _ = history.pop()
        except IndexError:
            message = ""
        return history, message or ""

    def generate(
        message: str,
        history_with_input: list[tuple[str, str]],
        system_prompt: str,
        max_new_tokens: int,
        temperature: float,
        top_p: float,
        top_k: int,
        platform: str,
    ) -> tuple[Iterator[list[tuple[str, str]]], str]:
        if max_new_tokens > MAX_MAX_NEW_TOKENS:
            raise ValueError
        try:
            history = history_with_input[:-1]
            yield history + [(message, "")], "## processing prompt"
            generator = llama2_wrapper.run(
                message,
                history,
                system_prompt,
                max_new_tokens,
                temperature,
                top_p,
                top_k,
            )
            t = -time.perf_counter()
            try:
                first_response = next(generator)
                t += time.perf_counter()
                yield history + [(message, first_response)], "## generating"
                t -= time.perf_counter()
            except StopIteration:
                yield history + [(message, "")], "## terminated"
            num_tokens = 1
            t = -time.perf_counter()
            for response in generator:
                num_tokens += 1
                t += time.perf_counter()
                yield history + [(message, response)], "## generating"
                t -= time.perf_counter()
            t += time.perf_counter()
            if platform is None:
                platform = "CUDA by default"
            yield history + [(message, response)], f"### num tok: {num_tokens}<br>time(sec): {t:.2f}<br>tok/sec: {num_tokens / t:.2f}<br>{BACKEND_TYPE}({platform})"
        except Exception as e:
            logging.exception(e)

    def check_input_token_length(
        message: str, chat_history: list[tuple[str, str]], system_prompt: str
    ) -> None:
        input_token_length = llama2_wrapper.get_input_token_length(
            message, chat_history, system_prompt
        )
        if input_token_length > MAX_INPUT_TOKEN_LENGTH:
            raise gr.Error(
                f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
            )

    prompts_container = PromtsContainer()
    prompts = prompts_container.get_prompts_tab_dict()
    default_prompts_checkbox = False
    default_advanced_checkbox = False

    def convert_summary_to_prompt(summary):
        return prompts_container.get_prompt_by_summary(summary)

    def tab_list(tab_data, chatbot, perf, platform):
        for item in tab_data:
            with gr.Group():
                gr.HTML(
                    f'<p style="color: black; font-weight: bold;">{item["act"]}</p>'
                )
                prompt_text = gr.Button(
                    value=f"{item['summary']}",
                    size="sm",
                    elem_classes="text-left-aligned",
                )
                prompt_text.click(
                    fn=save_textbox_for_prompt,
                    inputs=prompt_text,
                    outputs=saved_input,
                    api_name=False,
                    queue=True,
                ).then(
                    fn=display_input,
                    inputs=[saved_input, chatbot],
                    outputs=chatbot,
                    api_name=False,
                    queue=True,
                ).then(
                    fn=check_input_token_length,
                    inputs=[saved_input, chatbot, system_prompt],
                    api_name=False,
                    queue=False,
                ).success(
                    fn=generate,
                    inputs=[
                        saved_input,
                        chatbot,
                        system_prompt,
                        max_new_tokens,
                        temperature,
                        top_p,
                        top_k,
                        platform,
                    ],
                    outputs=[
                        chatbot,
                        perf,
                    ],
                    api_name=False,
                )

    CSS = """
.contain { display: flex; flex-direction: column;}
.text-left-aligned {text-align: left !important; font-size: 16px;}
"""
    with gr.Blocks(css=CSS, title="Gradio") as demo:
        with gr.Row():
            with gr.Column(
                visible=default_advanced_checkbox, variant="compact"
            ) as advanced_column:
                system_prompt = gr.Textbox(
                    label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6
                )
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=4.0,
                    step=0.1,
                    value=1.0,
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    minimum=0.05,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                )
                top_k = gr.Slider(
                    label="Top-k",
                    minimum=1,
                    maximum=1000,
                    step=1,
                    value=50,
                )
            with gr.Column(scale=2):
                with gr.Row():
                    gr.Markdown("# llama2-webui")
                    perf = gr.Markdown(
                        value=f"## performance<br>Current Backend: {BACKEND_TYPE}",
                        rtl=True,
                    )
                with gr.Group():
                    chatbot = gr.Chatbot(label="Chatbot")
                    with gr.Row():
                        textbox = gr.Textbox(
                            container=False,
                            show_label=False,
                            placeholder="Type a message...",
                            scale=10,
                        )
                        submit_button = gr.Button(
                            "Submit",
                            variant="primary",
                        )
                with gr.Row():
                    retry_button = gr.Button("🔄 Retry", variant="secondary")
                    undo_button = gr.Button("↩️ Undo", variant="secondary")
                    clear_button = gr.Button("🗑️ Clear", variant="secondary")

                saved_input = gr.State()
                with gr.Row():
                    advanced_checkbox = gr.Checkbox(
                        label="Advanced",
                        value=default_advanced_checkbox,
                        container=False,
                        elem_classes="min_check",
                    )
                    prompts_checkbox = gr.Checkbox(
                        label="Prompts",
                        value=default_prompts_checkbox,
                        container=False,
                        elem_classes="min_check",
                    )
                with gr.Row():
                    platform = gr.Radio(
                        ["CUDA", "platform2"],
                        label="Choose hardware platform",
                        info="CUDA by default if none chosen",
                    )
            with gr.Column(visible=default_prompts_checkbox) as prompt_column:
                for k, v in prompts.items():
                    with gr.Tab(k):
                        tab_list(v, chatbot, perf, platform)

        prompts_checkbox.change(
            lambda x: gr.update(visible=x),
            prompts_checkbox,
            prompt_column,
            queue=False,
        )

        advanced_checkbox.change(
            lambda x: gr.update(visible=x),
            advanced_checkbox,
            advanced_column,
            queue=False,
        )

        textbox.submit(
            fn=clear_and_save_textbox,
            inputs=textbox,
            outputs=[textbox, saved_input],
            api_name=False,
            queue=False,
        ).then(
            fn=display_input,
            inputs=[saved_input, chatbot],
            outputs=chatbot,
            api_name=False,
            queue=False,
        ).then(
            fn=check_input_token_length,
            inputs=[saved_input, chatbot, system_prompt],
            api_name=False,
            queue=False,
        ).success(
            fn=generate,
            inputs=[
                saved_input,
                chatbot,
                system_prompt,
                max_new_tokens,
                temperature,
                top_p,
                top_k,
                platform,
            ],
            outputs=[
                chatbot,
                perf,
            ],
            api_name=False,
        )

        submit_button.click(
            fn=clear_and_save_textbox,
            inputs=textbox,
            outputs=[textbox, saved_input],
            api_name=False,
            queue=False,
        ).then(
            fn=display_input,
            inputs=[saved_input, chatbot],
            outputs=chatbot,
            api_name=False,
            queue=False,
        ).then(
            fn=check_input_token_length,
            inputs=[saved_input, chatbot, system_prompt],
            api_name=False,
            queue=False,
        ).success(
            fn=generate,
            inputs=[
                saved_input,
                chatbot,
                system_prompt,
                max_new_tokens,
                temperature,
                top_p,
                top_k,
                platform,
            ],
            outputs=[
                chatbot,
                perf,
            ],
            api_name=False,
        )

        retry_button.click(
            fn=delete_prev_fn,
            inputs=chatbot,
            outputs=[chatbot, saved_input],
            api_name=False,
            queue=False,
        ).then(
            fn=display_input,
            inputs=[saved_input, chatbot],
            outputs=chatbot,
            api_name=False,
            queue=False,
        ).then(
            fn=generate,
            inputs=[
                saved_input,
                chatbot,
                system_prompt,
                max_new_tokens,
                temperature,
                top_p,
                top_k,
                platform,
            ],
            outputs=[
                chatbot,
                perf,
            ],
            api_name=False,
        )

        undo_button.click(
            fn=delete_prev_fn,
            inputs=chatbot,
            outputs=[chatbot, saved_input],
            api_name=False,
            queue=False,
        ).then(
            fn=lambda x: x,
            inputs=[saved_input],
            outputs=textbox,
            api_name=False,
            queue=False,
        )

        clear_button.click(
            fn=lambda: ([], ""),
            outputs=[chatbot, saved_input],
            queue=False,
            api_name=False,
        )

    demo.queue(max_size=20).launch(share=args.share)


if __name__ == "__main__":
    main()
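A note on the CLI flags above: `argparse` with `type=bool` converts via `bool()`, so any non-empty string (including `"False"`) parses as `True`; `--load_in_8bit False` therefore still enables 8-bit. A minimal sketch of the usual alternatives (illustrative only, not how this repo's scripts are wired):

```python
# Minimal sketch: boolean CLI flags that behave as expected.
# With type=bool, argparse evaluates bool("False"), which is True,
# so "--load_in_8bit False" would still turn the option on.
import argparse
from distutils.util import strtobool

parser = argparse.ArgumentParser()
# Option 1: a presence flag; passing --share means True, omitting it means False.
parser.add_argument("--share", action="store_true", help="Share a public gradio link.")
# Option 2: parse the string explicitly (accepts true/false, yes/no, 1/0).
parser.add_argument(
    "--load_in_8bit",
    type=lambda v: bool(strtobool(v)),
    default=False,
    help="Whether to use bitsandbytes 8 bit.",
)

args = parser.parse_args(["--load_in_8bit", "false"])
print(args.share, args.load_in_8bit)  # False False
```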
benchmark.py
ADDED
@@ -0,0 +1,145 @@
import os
import time
import argparse

from dotenv import load_dotenv
from distutils.util import strtobool
from memory_profiler import memory_usage
from tqdm import tqdm

from llama2_wrapper import LLAMA2_WRAPPER


def run_iteration(
    llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
):
    def generation():
        generator = llama2_wrapper.run(
            prompt_example,
            [],
            DEFAULT_SYSTEM_PROMPT,
            DEFAULT_MAX_NEW_TOKENS,
            1,
            0.95,
            50,
        )
        model_response = None
        try:
            first_model_response = next(generator)
        except StopIteration:
            pass
        for model_response in generator:
            pass
        return llama2_wrapper.get_token_length(model_response), model_response

    tic = time.perf_counter()
    mem_usage, (output_token_length, model_response) = memory_usage(
        (generation,), max_usage=True, retval=True
    )
    toc = time.perf_counter()

    generation_time = toc - tic
    tokens_per_second = output_token_length / generation_time

    return generation_time, tokens_per_second, mem_usage, model_response


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--iter", type=int, default=5, help="Number of iterations")
    parser.add_argument("--model_path", type=str, default="", help="model path")
    parser.add_argument(
        "--backend_type",
        type=str,
        default="",
        help="Backend options: llama.cpp, gptq, transformers",
    )
    parser.add_argument(
        "--load_in_8bit",
        type=bool,
        default=False,
        help="Whether to use bitsandbytes 8 bit.",
    )

    args = parser.parse_args()

    load_dotenv()

    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))

    MODEL_PATH = os.getenv("MODEL_PATH")
    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
    BACKEND_TYPE = os.getenv("BACKEND_TYPE")
    assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"

    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))

    if args.model_path != "":
        MODEL_PATH = args.model_path
    if args.backend_type != "":
        BACKEND_TYPE = args.backend_type
    if args.load_in_8bit:
        LOAD_IN_8BIT = True

    # Initialization
    init_tic = time.perf_counter()
    llama2_wrapper = LLAMA2_WRAPPER(
        model_path=MODEL_PATH,
        backend_type=BACKEND_TYPE,
        max_tokens=MAX_INPUT_TOKEN_LENGTH,
        load_in_8bit=LOAD_IN_8BIT,
        # verbose=True,
    )

    init_toc = time.perf_counter()
    initialization_time = init_toc - init_tic

    total_time = 0
    total_tokens_per_second = 0
    total_memory_gen = 0

    prompt_example = (
        "Can you explain briefly to me what is the Python programming language?"
    )

    # Cold run
    print("Performing cold run...")
    run_iteration(
        llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
    )

    # Timed runs
    print(f"Performing {args.iter} timed runs...")
    for i in tqdm(range(args.iter)):
        try:
            gen_time, tokens_per_sec, mem_gen, model_response = run_iteration(
                llama2_wrapper,
                prompt_example,
                DEFAULT_SYSTEM_PROMPT,
                DEFAULT_MAX_NEW_TOKENS,
            )
            total_time += gen_time
            total_tokens_per_second += tokens_per_sec
            total_memory_gen += mem_gen
        except:
            break
    avg_time = total_time / (i + 1)
    avg_tokens_per_second = total_tokens_per_second / (i + 1)
    avg_memory_gen = total_memory_gen / (i + 1)

    print(f"Last model response: {model_response}")
    print(f"Initialization time: {initialization_time:0.4f} seconds.")
    print(
        f"Average generation time over {(i + 1)} iterations: {avg_time:0.4f} seconds."
    )
    print(
        f"Average speed over {(i + 1)} iterations: {avg_tokens_per_second:0.4f} tokens/sec."
    )
    print(f"Average memory usage during generation: {avg_memory_gen:.2f} MiB")


if __name__ == "__main__":
    main()
code_completion.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse

import gradio as gr
from llama2_wrapper import LLAMA2_WRAPPER

FIM_PREFIX = "<PRE> "
FIM_MIDDLE = " <MID>"
FIM_SUFFIX = " <SUF>"

FIM_INDICATOR = "<FILL_ME>"

EOS_STRING = "</s>"
EOT_STRING = "<EOT>"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path",
        type=str,
        default="./models/codellama-7b-instruct.ggmlv3.Q4_0.bin",
        help="model path",
    )
    parser.add_argument(
        "--backend_type",
        type=str,
        default="llama.cpp",
        help="Backend options: llama.cpp, gptq, transformers",
    )
    parser.add_argument(
        "--max_tokens",
        type=int,
        default=4000,
        help="Maximum context size.",
    )
    parser.add_argument(
        "--load_in_8bit",
        type=bool,
        default=False,
        help="Whether to use bitsandbytes 8 bit.",
    )
    parser.add_argument(
        "--share",
        type=bool,
        default=False,
        help="Whether to share public for gradio.",
    )
    args = parser.parse_args()

    llama2_wrapper = LLAMA2_WRAPPER(
        model_path=args.model_path,
        backend_type=args.backend_type,
        max_tokens=args.max_tokens,
        load_in_8bit=args.load_in_8bit,
    )

    def generate(
        prompt,
        temperature=0.9,
        max_new_tokens=256,
        top_p=0.95,
        repetition_penalty=1.0,
    ):
        temperature = float(temperature)
        if temperature < 1e-2:
            temperature = 1e-2
        top_p = float(top_p)
        fim_mode = False

        generate_kwargs = dict(
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            stream=True,
        )

        if FIM_INDICATOR in prompt:
            fim_mode = True
            try:
                prefix, suffix = prompt.split(FIM_INDICATOR)
            except:
                raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
            prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"

        stream = llama2_wrapper.__call__(prompt, **generate_kwargs)

        if fim_mode:
            output = prefix
        else:
            output = prompt

        # for response in stream:
        #     output += response
        #     yield output
        # return output

        previous_token = ""
        for response in stream:
            if any([end_token in response for end_token in [EOS_STRING, EOT_STRING]]):
                if fim_mode:
                    output += suffix
                    yield output
                    return output
                    print("output", output)
                else:
                    return output
            else:
                output += response
            previous_token = response
            yield output
        return output

    examples = [
        'def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\nprint(remove_non_ascii(\'afkdj$$(\'))',
        "X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1)\n\n# Train a logistic regression model, predict the labels on the test set and compute the accuracy score",
        "// Returns every other value in the array as a new array.\nfunction everyOther(arr) {",
        "Poor English: She no went to the market. Corrected English:",
        "def alternating(list1, list2):\n results = []\n for i in range(min(len(list1), len(list2))):\n results.append(list1[i])\n results.append(list2[i])\n if len(list1) > len(list2):\n <FILL_ME>\n else:\n results.extend(list2[i+1:])\n return results",
    ]

    def process_example(args):
        for x in generate(args):
            pass
        return x

    description = """
<div style="text-align: center;">
<h1>Code Llama Playground</h1>

</div>
<div style="text-align: center;">
<p>This is a demo to complete code with Code Llama. For instruction purposes, please use llama2-webui app.py with CodeLlama-Instruct models. </p>
</div>
"""
    with gr.Blocks() as demo:
        with gr.Column():
            gr.Markdown(description)
            with gr.Row():
                with gr.Column():
                    instruction = gr.Textbox(
                        placeholder="Enter your code here",
                        lines=5,
                        label="Input",
                        elem_id="q-input",
                    )
                    submit = gr.Button("Generate", variant="primary")
                    output = gr.Code(elem_id="q-output", lines=30, label="Output")
                    with gr.Row():
                        with gr.Column():
                            with gr.Accordion("Advanced settings", open=False):
                                with gr.Row():
                                    column_1, column_2 = gr.Column(), gr.Column()
                                    with column_1:
                                        temperature = gr.Slider(
                                            label="Temperature",
                                            value=0.1,
                                            minimum=0.0,
                                            maximum=1.0,
                                            step=0.05,
                                            interactive=True,
                                            info="Higher values produce more diverse outputs",
                                        )
                                        max_new_tokens = gr.Slider(
                                            label="Max new tokens",
                                            value=256,
                                            minimum=0,
                                            maximum=8192,
                                            step=64,
                                            interactive=True,
                                            info="The maximum numbers of new tokens",
                                        )
                                    with column_2:
                                        top_p = gr.Slider(
                                            label="Top-p (nucleus sampling)",
                                            value=0.90,
                                            minimum=0.0,
                                            maximum=1,
                                            step=0.05,
                                            interactive=True,
                                            info="Higher values sample more low-probability tokens",
                                        )
                                        repetition_penalty = gr.Slider(
                                            label="Repetition penalty",
                                            value=1.05,
                                            minimum=1.0,
                                            maximum=2.0,
                                            step=0.05,
                                            interactive=True,
                                            info="Penalize repeated tokens",
                                        )

            gr.Examples(
                examples=examples,
                inputs=[instruction],
                cache_examples=False,
                fn=process_example,
                outputs=[output],
            )

            submit.click(
                generate,
                inputs=[
                    instruction,
                    temperature,
                    max_new_tokens,
                    top_p,
                    repetition_penalty,
                ],
                outputs=[output],
            )
    demo.queue(concurrency_count=16).launch(share=args.share)


if __name__ == "__main__":
    main()
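Note: the fill-in-the-middle path above rewrites a prompt containing `<FILL_ME>` into Code Llama's `<PRE> … <SUF> … <MID>` infilling format before calling the wrapper. A minimal standalone sketch of just that transformation (no model involved; the constants are copied from the script above, the example prompt is hypothetical):

```python
FIM_PREFIX = "<PRE> "
FIM_MIDDLE = " <MID>"
FIM_SUFFIX = " <SUF>"
FIM_INDICATOR = "<FILL_ME>"


def build_fim_prompt(prompt: str) -> str:
    # Split the user prompt at the single <FILL_ME> marker and reorder it
    # into prefix/suffix/middle form, exactly as generate() does above.
    prefix, suffix = prompt.split(FIM_INDICATOR)
    return f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"


print(build_fim_prompt("def add(a, b):\n    return <FILL_ME>\n"))
# <PRE> def add(a, b):
#     return  <SUF>
#  <MID>
```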
colab/Llama_2_7b_Chat_GPTQ.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
colab/ggmlv3_q4_0.ipynb
ADDED
@@ -0,0 +1,109 @@
Notebook metadata: Colab notebook, Python 3 kernel, with an "Open In Colab" badge linking to
https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb

Code cell 1:
    %cd /content
    !pip install llama2-wrapper

Code cell 2 (recorded output: "Running on backend llama.cpp.", "Use default model path: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin", "Start downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin"):
    from llama2_wrapper import LLAMA2_WRAPPER, get_prompt

    llama2_wrapper = LLAMA2_WRAPPER()

Code cell 3 (recorded output: " Yes, I'm familiar with PyTorch! PyTorch is an open-source deep learning framework that is widely used for building and training neural networks. It was originally developed by Facebook and is now maintained by the PyTorch Foundation." followed by a ten-point feature list covering tensor computation, autograd, dynamic compute, memory efficiency, accelerator support, prebuilt modules, community, easy integration, the Pythonic API, and flexibility):
    prompt = get_prompt("Hi do you know Pytorch?")
    print(llama2_wrapper(prompt))
colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb
ADDED
@@ -0,0 +1,514 @@
Notebook metadata: Colab notebook, Python 3 kernel, T4 GPU runtime, with an "Open In Colab" badge linking to
https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb

Code cell 1:
    !pip install -U llama2-wrapper==0.1.12

Code cell 2:
    %cd /content
    !git clone https://github.com/liltom-eth/llama2-webui

    %cd /content/llama2-webui
    !python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Instruct-GPTQ

    %cd /content/llama2-webui
    !python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True

Recorded output (abridged; the original cell output is mostly download progress bars): 15 files of TheBloke/CodeLlama-7B-Instruct-GPTQ are fetched into ./models/CodeLlama-7B-Instruct-GPTQ, including the 3.90 GB model.safetensors. The app then starts with "Running on GPU with backend torch transformers.", caches the five Gradio examples, and serves on http://127.0.0.1:7860 plus a temporary public gradio.live share link until the run is interrupted from the keyboard.
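Note: the download step in this notebook goes through `llama2_wrapper.download`. A rough equivalent using `huggingface_hub` directly is sketched below; that this mirrors what the helper does internally is an assumption, not something stated in this diff.

```python
from huggingface_hub import snapshot_download

# Fetch every file of the GPTQ repo into the local models directory,
# similar to `python -m llama2_wrapper.download --repo_id ...` above.
snapshot_download(
    repo_id="TheBloke/CodeLlama-7B-Instruct-GPTQ",
    local_dir="./models/CodeLlama-7B-Instruct-GPTQ",
)
```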
docs/issues.md
ADDED
File without changes
docs/news.md
ADDED
@@ -0,0 +1,38 @@
# News
- [2023/09] The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.

- [2023/08] 🔥 For developers, we offer a web server that acts as a drop-in replacement for the OpenAI API.

  - Usage:

    ```
    python3 -m llama2_wrapper.server
    ```

- [2023/08] 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper on [PYPI](https://pypi.org/project/llama2-wrapper/).

  - Install: `pip install llama2-wrapper`

  - Usage:

    ```python
    from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
    llama2_wrapper = LLAMA2_WRAPPER(
        model_path="./models/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q4_0.bin",
        backend_type="llama.cpp",  # options: llama.cpp, transformers, gptq
    )
    prompt = "Do you know Pytorch"
    llama2_prompt = get_prompt(prompt)
    answer = llama2_wrapper(llama2_prompt, temperature=0.9)
    ```

- [2023/08] 🔥 We added `benchmark.py` for users to benchmark llama2 models on their local devices.

  - Check/contribute the performance of your device in the full [performance doc](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md).

- [2023/07] We released **[llama2-webui](https://github.com/liltom-eth/llama2-webui)**, a gradio web UI to run Llama 2 on GPU or CPU from anywhere (Linux/Windows/Mac).

  - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
  - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
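Note: once the OpenAI-compatible server from the news entry above is running, it can be exercised with any HTTP client. A minimal sketch follows; the `/v1/completions` route and default port 8000 come from `docs/pypi.md` below, while the exact payload fields are an assumption based on the OpenAI completions API shape.

```python
import requests

# Send a completion request to the locally running llama2_wrapper.server.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Hi do you know Pytorch?", "max_tokens": 128},
    timeout=120,
)
print(resp.json())
```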
docs/performance.md
ADDED
@@ -0,0 +1,32 @@
# Benchmark Performance

## Performance on Nvidia GPU

| Model | Precision | Device | GPU VRAM | Speed (tokens/sec) | Load time (s) |
| --- | --- | --- | --- | --- | --- |
| Llama-2-7b-chat-hf | 16 bit | | | | |
| Llama-2-7b-chat-hf | 8 bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 |
| Llama-2-7b-Chat-GPTQ | 4 bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 |
| Llama-2-7b-Chat-GPTQ | 4 bit | NVIDIA GTX 1660 Super | 4.8 GB VRAM | 8.5 | 262.74 |
| Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 |
| Llama-2-13b-chat-hf | 16 bit | | | | |
| | | | | | |

## Performance on CPU / OpenBLAS / cuBLAS / CLBlast / Metal

| Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | Load time (s) |
| --- | --- | --- | --- | --- | --- |
| llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 |
| llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 CPU | 4.5 GB RAM | 11.10 | 0.10 |
| llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 Metal | 4.5 GB RAM | 12.10 | 0.12 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-8700 | 5.4 GB RAM | 6.27 | 173.15 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-9700 | 4.8 GB RAM | 4.2 | 87.9 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | AMD Ryzen 9 5900HS | 4.1 GB RAM | 6.01 | 0.15 |
| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel vServer (4 threads, eth services) | 8 GB RAM | 1.31 | 0.5 |
| llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-8700 | 8.6 GB RAM | 2.63 | 336.57 |
| llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-9700 | 7.6 GB RAM | 2.05 | 302.9 |
| | | | | | |
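Note: the Speed column is the average tokens-per-second figure printed by `benchmark.py`; conceptually it is just the number of generated tokens divided by the generation time, e.g. (hypothetical numbers):

```python
# Speed (tokens/sec) as reported by benchmark.py: generated tokens / generation time.
generated_tokens = 256
generation_time_s = 13.6  # hypothetical measurement
print(f"{generated_tokens / generation_time_s:.2f} tokens/sec")  # 18.82
```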
docs/pypi.md
ADDED
@@ -0,0 +1,187 @@
# llama2-wrapper

- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb).

- [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.

## Features

- Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)...
- Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on Colab T4 GPU](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb)
- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
- [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.
- [News](https://github.com/liltom-eth/llama2-webui/blob/main/docs/news.md), [Benchmark](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md), [Issue Solutions](https://github.com/liltom-eth/llama2-webui/blob/main/docs/issues.md)

[llama2-wrapper](https://pypi.org/project/llama2-wrapper/) is the backend and part of [llama2-webui](https://github.com/liltom-eth/llama2-webui), which can run any Llama 2 locally with a gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac).

## Install

```bash
pip install llama2-wrapper
```

## Start OpenAI Compatible API

```
python -m llama2_wrapper.server
```

It will use `llama.cpp` as the backend by default and run the `llama-2-7b-chat.ggmlv3.q4_0.bin` model.

Start the FastAPI server for the `gptq` backend:

```
python -m llama2_wrapper.server --backend_type gptq
```

Navigate to http://localhost:8000/docs to see the OpenAPI documentation.

## API Usage

### `__call__`

`__call__()` is the function to generate text from a prompt.

For example, run a ggml llama2 model on CPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb):

```python
from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
llama2_wrapper = LLAMA2_WRAPPER()
# Default running on backend llama.cpp.
# Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin
prompt = "Do you know Pytorch"
# llama2_wrapper() will run __call__()
answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
```

Run a gptq llama2 model on an Nvidia GPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb):

```python
from llama2_wrapper import LLAMA2_WRAPPER
llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
# Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
```

Run llama2 7b with bitsandbytes 8 bit with a `model_path`:

```python
from llama2_wrapper import LLAMA2_WRAPPER
llama2_wrapper = LLAMA2_WRAPPER(
    model_path="./models/Llama-2-7b-chat-hf",
    backend_type="transformers",
    load_in_8bit=True,
)
```

### completion

`completion()` is the function to generate text from a prompt for the OpenAI compatible API `/v1/completions`.

```python
llama2_wrapper = LLAMA2_WRAPPER()
prompt = get_prompt("Hi do you know Pytorch?")
print(llama2_wrapper.completion(prompt))
```

### chat_completion

`chat_completion()` is the function to generate text from a dialog (chat history) for the OpenAI compatible API `/v1/chat/completions`.

```python
llama2_wrapper = LLAMA2_WRAPPER()
dialog = [
    {
        "role": "system",
        "content": "You are a helpful, respectful and honest assistant. ",
    },
    {
        "role": "user",
        "content": "Hi do you know Pytorch?",
    },
]
print(llama2_wrapper.chat_completion(dialog))
```

### generate

`generate()` is the function to create a generator of responses from a prompt.

This is useful when you want to stream the output, like typing in a chatbot.

```python
llama2_wrapper = LLAMA2_WRAPPER()
prompt = get_prompt("Hi do you know Pytorch?")
for response in llama2_wrapper.generate(prompt):
    print(response)
```

The response will be like:

```
Yes,
Yes, I'm
Yes, I'm familiar
Yes, I'm familiar with
Yes, I'm familiar with PyTorch!
...
```

### run

`run()` is similar to `generate()`, but `run()` can also accept `chat_history` and `system_prompt` from the users.

It will process the input message into the llama2 prompt template with `chat_history` and `system_prompt` for a chatbot-like app.

### get_prompt

`get_prompt()` will process the input message into a llama2 prompt with `chat_history` and `system_prompt` for a chatbot.

By default, `chat_history` and `system_prompt` are empty, and `get_prompt()` will add the llama2 prompt template to your message:

```python
prompt = get_prompt("Hi do you know Pytorch?")
```

The prompt will be:

```
[INST] <<SYS>>

<</SYS>>

Hi do you know Pytorch? [/INST]
```

If you use `get_prompt("Hi do you know Pytorch?", system_prompt="You are a helpful...")`, the prompt will be:

```
[INST] <<SYS>>
You are a helpful, respectful and honest assistant.
<</SYS>>

Hi do you know Pytorch? [/INST]
```

### get_prompt_for_dialog

`get_prompt_for_dialog()` will process a dialog (chat history) into a llama2 prompt for the OpenAI compatible API `/v1/chat/completions`.

```python
dialog = [
    {
        "role": "system",
        "content": "You are a helpful, respectful and honest assistant. ",
    },
    {
        "role": "user",
        "content": "Hi do you know Pytorch?",
    },
]
prompt = get_prompt_for_dialog(dialog)
# [INST] <<SYS>>
# You are a helpful, respectful and honest assistant.
# <</SYS>>
#
# Hi do you know Pytorch? [/INST]
```
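Note: the `run` section above names its extra arguments but shows no snippet. The sketch below streams one chat turn through `run()`; only the parameter names `chat_history` and `system_prompt` come from the text above, while the first positional argument and the shape of `chat_history` (a list of earlier user/assistant message pairs) are assumptions.

```python
from llama2_wrapper import LLAMA2_WRAPPER

llama2_wrapper = LLAMA2_WRAPPER()
chat_history = []  # assumed: pairs of (user_message, model_answer) from earlier turns
for partial in llama2_wrapper.run(
    "Hi do you know Pytorch?",
    chat_history=chat_history,
    system_prompt="You are a helpful, respectful and honest assistant.",
):
    print(partial)  # progressively longer responses, as with generate()
```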
env_examples/.env.13b_example
ADDED
@@ -0,0 +1,13 @@
MODEL_PATH = "./models/Llama-2-13b-chat-hf"

# options: llama.cpp, gptq, transformers
BACKEND_TYPE = "transformers"

# only for transformers bitsandbytes 8 bit
LOAD_IN_8BIT = True

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4000

DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
env_examples/.env.7b_8bit_example
ADDED
@@ -0,0 +1,13 @@
MODEL_PATH = "./models/Llama-2-7b-chat-hf"

# options: llama.cpp, gptq, transformers
BACKEND_TYPE = "transformers"

# only for transformers bitsandbytes 8 bit
LOAD_IN_8BIT = True

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4000

DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
env_examples/.env.7b_ggmlv3_q4_0_example
ADDED
@@ -0,0 +1,18 @@
1 |
+
MODEL_PATH = ""
|
2 |
+
# if MODEL_PATH is "", default llama.cpp/gptq models
|
3 |
+
# will be downloaded to: ./models
|
4 |
+
|
5 |
+
# Example ggml path:
|
6 |
+
# MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"
|
7 |
+
|
8 |
+
# options: llama.cpp, gptq, transformers
|
9 |
+
BACKEND_TYPE = "llama.cpp"
|
10 |
+
|
11 |
+
# only for transformers bitsandbytes 8 bit
|
12 |
+
LOAD_IN_8BIT = False
|
13 |
+
|
14 |
+
MAX_MAX_NEW_TOKENS = 2048
|
15 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
16 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
17 |
+
|
18 |
+
DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
|
env_examples/.env.7b_gptq_example
ADDED
@@ -0,0 +1,18 @@
MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
# if MODEL_PATH is "", default llama.cpp/gptq models
# will be downloaded to: ./models

# Example gptq path:
# MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"

# options: llama.cpp, gptq, transformers
BACKEND_TYPE = "gptq"

# only for transformers bitsandbytes 8 bit
LOAD_IN_8BIT = False

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4000

DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
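These example files are meant to be copied to `.env` in the project root. A common way to consume them is `python-dotenv` plus `os.getenv`; the snippet below only illustrates that pattern under those assumptions and is not the project's actual startup code:

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads .env from the current working directory

MODEL_PATH = os.getenv("MODEL_PATH", "")
BACKEND_TYPE = os.getenv("BACKEND_TYPE", "llama.cpp")
LOAD_IN_8BIT = os.getenv("LOAD_IN_8BIT", "False").lower() == "true"
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4000"))
```
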
llama2_cu_python/Makefile
ADDED
@@ -0,0 +1,9 @@
NVCC = nvcc

.PHONY: libllama2
libllama2: llama2.cu
	$(NVCC) -DUSE_CUDA --shared -O3 -lcublas -lm -o libllama2.so llama2.cu --compiler-options '-fPIC'

.PHONY: clean
clean:
	rm -f libllama2.so
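The resulting `libllama2.so` is an ordinary shared library, so it can be loaded from Python with `ctypes`. The check below is only an illustration; the real function prototypes are declared in `llama2.h` and wrapped by `llama2_cu_python/llama2_cu.py`:

```python
import ctypes
import pathlib

# Load the library produced by `make libllama2`; the path is an assumption
# (adjust it to wherever the .so was built).
lib = ctypes.CDLL(str(pathlib.Path("llama2_cu_python") / "libllama2.so"))
print("loaded:", lib)
```
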
llama2_cu_python/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .llama2_cu import *

__version__ = "0.1"
llama2_cu_python/libllama2.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:722e0a53e9d9afbd37491c3eff84e7fcd1c1e2331a575c83d283ba4ff62e269f
size 1038952
llama2_cu_python/llama2.cu
ADDED
@@ -0,0 +1,1394 @@
1 |
+
/* Inference for Llama-2 Transformer model in pure C
|
2 |
+
* With added CUDA support initially drawing from
|
3 |
+
* https://github.com/ankan-ban/llama2.cu/blob/master/llama2.cu
|
4 |
+
* and structured in a way that hopefully makes keeping it
|
5 |
+
* up-to-date straightforward.
|
6 |
+
*/
|
7 |
+
|
8 |
+
#include <stdio.h>
|
9 |
+
#include <stdlib.h>
|
10 |
+
#include <ctype.h>
|
11 |
+
#include <time.h>
|
12 |
+
#include <math.h>
|
13 |
+
#include <string.h>
|
14 |
+
#include <fcntl.h>
|
15 |
+
#include <assert.h>
|
16 |
+
#include <future>
|
17 |
+
#if defined _WIN32
|
18 |
+
#include "win.h"
|
19 |
+
#else
|
20 |
+
#include <unistd.h>
|
21 |
+
#include <sys/mman.h>
|
22 |
+
#endif
|
23 |
+
#include "llama2.h"
|
24 |
+
|
25 |
+
#ifdef USE_CUDA
|
26 |
+
#include <cuda_runtime.h>
|
27 |
+
#include <cub/cub.cuh>
|
28 |
+
#include <cublas_v2.h>
|
29 |
+
|
30 |
+
// Each CUDA function call should be checked for errors.
|
31 |
+
#define CUCHK(err) cuda_check((err), __FILE__, __LINE__)
|
32 |
+
inline void cuda_check(cudaError_t error_code, const char *file, int line)
|
33 |
+
{
|
34 |
+
if (error_code != cudaSuccess)
|
35 |
+
{
|
36 |
+
fprintf(stderr, "CUDA Error %d: %s. In file '%s' on line %d\n", error_code, cudaGetErrorString(error_code), file, line);
|
37 |
+
fflush(stderr);
|
38 |
+
exit(error_code);
|
39 |
+
}
|
40 |
+
}
|
41 |
+
|
42 |
+
// cublasHandle_t g_cublas_handle = nullptr;
|
43 |
+
|
44 |
+
// void create_cublas_handle() {
|
45 |
+
// cublasStatus_t stat = cublasCreate(&g_cublas_handle); // FIXME cublasDestroy
|
46 |
+
// if (stat != CUBLAS_STATUS_SUCCESS) {
|
47 |
+
// printf ("CUBLAS initialization failed\n");
|
48 |
+
// exit(EXIT_FAILURE);
|
49 |
+
// }
|
50 |
+
// }
|
51 |
+
// void destroy_cublas_handle() {
|
52 |
+
// cublasStatus_t stat = cublasDestroy(g_cublas_handle);
|
53 |
+
// if (stat != CUBLAS_STATUS_SUCCESS) {
|
54 |
+
// printf ("CUBLAS initialization failed\n");
|
55 |
+
// exit(EXIT_FAILURE);
|
56 |
+
// }
|
57 |
+
// }
|
58 |
+
#endif
|
59 |
+
|
60 |
+
// ----------------------------------------------------------------------------
|
61 |
+
// Transformer model
|
62 |
+
|
63 |
+
typedef struct {
|
64 |
+
int dim; // transformer dimension
|
65 |
+
int hidden_dim; // for ffn layers
|
66 |
+
int n_layers; // number of layers
|
67 |
+
int n_heads; // number of query heads
|
68 |
+
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
|
69 |
+
int vocab_size; // vocabulary size, usually 256 (byte-level)
|
70 |
+
int seq_len; // max sequence length
|
71 |
+
} Config;
|
72 |
+
|
73 |
+
// CUDA NOTE: The TransformerWeights structure will be stored on the host,
|
74 |
+
// but all of the pointers in the structure will point to data on the GPU.
|
75 |
+
// The checkpoint file is mmap-ed to the host and the weights portion
|
76 |
+
// is allocated on and copied to the GPU. Then, memory_map_weights() updates
|
77 |
+
// these structure pointers to point to the proper location. Happily, this
|
78 |
+
// function is the same for both C and CUDA.
|
79 |
+
typedef struct {
|
80 |
+
// token embedding table
|
81 |
+
float* token_embedding_table; // (vocab_size, dim)
|
82 |
+
// weights for rmsnorms
|
83 |
+
float* rms_att_weight; // (layer, dim) rmsnorm weights
|
84 |
+
float* rms_ffn_weight; // (layer, dim)
|
85 |
+
// weights for matmuls. note dim == n_heads * head_size
|
86 |
+
float* wq; // (layer, dim, n_heads * head_size)
|
87 |
+
float* wk; // (layer, dim, n_kv_heads * head_size)
|
88 |
+
float* wv; // (layer, dim, n_kv_heads * head_size)
|
89 |
+
float* wo; // (layer, n_heads * head_size, dim)
|
90 |
+
// weights for ffn
|
91 |
+
float* w1; // (layer, hidden_dim, dim)
|
92 |
+
float* w2; // (layer, dim, hidden_dim)
|
93 |
+
float* w3; // (layer, hidden_dim, dim)
|
94 |
+
// final rmsnorm
|
95 |
+
float* rms_final_weight; // (dim,)
|
96 |
+
// (optional) classifier weights for the logits, on the last layer
|
97 |
+
float* wcls;
|
98 |
+
} TransformerWeights;
|
99 |
+
|
100 |
+
// CUDA NOTE: The RunState structure will be stored on the host, but all of the
|
101 |
+
// pointers in the structure will point to data on the GPU, created via
|
102 |
+
// cudaMalloc. The exception is logits which is the final result of the
|
103 |
+
// transformer & is copied from the GPU as the last step in the transformer
|
104 |
+
// and is used by the host.
|
105 |
+
typedef struct {
|
106 |
+
// current wave of activations
|
107 |
+
float *x; // activation at current time stamp (dim,)
|
108 |
+
float *xb; // same, but inside a residual branch (dim,)
|
109 |
+
float *xb2; // an additional buffer just for convenience (dim,)
|
110 |
+
float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
|
111 |
+
float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
|
112 |
+
float *q; // query (dim,)
|
113 |
+
float *k; // key (dim,)
|
114 |
+
float *v; // value (dim,)
|
115 |
+
float *att; // buffer for scores/attention values (n_heads, seq_len)
|
116 |
+
#ifdef USE_CUDA
|
117 |
+
float *logits_gpu; // output logits in GPU
|
118 |
+
#endif
|
119 |
+
float *logits; // output logits in CPU
|
120 |
+
// kv cache
|
121 |
+
float* key_cache; // (layer, seq_len, dim)
|
122 |
+
float* value_cache; // (layer, seq_len, dim)
|
123 |
+
} RunState;
|
124 |
+
|
125 |
+
typedef struct {
|
126 |
+
Config config; // the hyperparameters of the architecture (the blueprint)
|
127 |
+
TransformerWeights weights; // the weights of the model
|
128 |
+
RunState state; // buffers for the "wave" of activations in the forward pass
|
129 |
+
// some more state needed to properly clean up the memory mapping (sigh)
|
130 |
+
int fd; // file descriptor for memory mapping
|
131 |
+
float* data; // memory mapped data pointer
|
132 |
+
ssize_t file_size; // size of the checkpoint file in bytes
|
133 |
+
} Transformer;
|
134 |
+
|
135 |
+
#ifdef USE_CUDA
|
136 |
+
void malloc_run_state(RunState* s, Config* p) {
|
137 |
+
// we calloc instead of malloc to keep valgrind happy
|
138 |
+
int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
|
139 |
+
CUCHK(cudaMalloc((void**)&s->x, p->dim * sizeof(float)));
|
140 |
+
CUCHK(cudaMalloc((void**)&s->xb, p->dim * sizeof(float)));
|
141 |
+
CUCHK(cudaMalloc((void**)&s->xb2, p->dim * sizeof(float)));
|
142 |
+
CUCHK(cudaMalloc((void**)&s->hb, p->hidden_dim * sizeof(float)));
|
143 |
+
CUCHK(cudaMalloc((void**)&s->hb2, p->hidden_dim * sizeof(float)));
|
144 |
+
CUCHK(cudaMalloc((void**)&s->q, p->dim * sizeof(float)));
|
145 |
+
CUCHK(cudaMalloc((void**)&s->key_cache, p->n_layers * p->seq_len * kv_dim * sizeof(float)));
|
146 |
+
CUCHK(cudaMalloc((void**)&s->value_cache, p->n_layers * p->seq_len * kv_dim * sizeof(float)));
|
147 |
+
CUCHK(cudaMalloc((void**)&s->att, p->n_heads * p->seq_len * sizeof(float)));
|
148 |
+
CUCHK(cudaMalloc((void**)&s->logits_gpu, p->vocab_size * sizeof(float)));
|
149 |
+
s->logits = (float *)calloc(p->vocab_size, sizeof(float));
|
150 |
+
// ensure all mallocs went fine
|
151 |
+
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
|
152 |
+
|| !s->key_cache || !s->value_cache || !s->att || !s->logits_gpu || !s->logits) {
|
153 |
+
fprintf(stderr, "malloc failed!\n");
|
154 |
+
exit(EXIT_FAILURE);
|
155 |
+
}
|
156 |
+
}
|
157 |
+
#else
|
158 |
+
void malloc_run_state(RunState* s, Config* p) {
|
159 |
+
// we calloc instead of malloc to keep valgrind happy
|
160 |
+
int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
|
161 |
+
s->x = (float *)calloc(p->dim, sizeof(float));
|
162 |
+
s->xb = (float *)calloc(p->dim, sizeof(float));
|
163 |
+
s->xb2 = (float *)calloc(p->dim, sizeof(float));
|
164 |
+
s->hb = (float *)calloc(p->hidden_dim, sizeof(float));
|
165 |
+
s->hb2 = (float *)calloc(p->hidden_dim, sizeof(float));
|
166 |
+
s->q = (float *)calloc(p->dim, sizeof(float));
|
167 |
+
s->key_cache = (float *)calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
|
168 |
+
s->value_cache = (float *)calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
|
169 |
+
s->att = (float *)calloc(p->n_heads * p->seq_len, sizeof(float));
|
170 |
+
s->logits = (float *)calloc(p->vocab_size, sizeof(float));
|
171 |
+
// ensure all mallocs went fine
|
172 |
+
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
|
173 |
+
|| !s->key_cache || !s->value_cache || !s->att || !s->logits) {
|
174 |
+
fprintf(stderr, "malloc failed!\n");
|
175 |
+
exit(EXIT_FAILURE);
|
176 |
+
}
|
177 |
+
}
|
178 |
+
#endif
|
179 |
+
|
180 |
+
#ifdef USE_CUDA
|
181 |
+
void free_run_state(RunState* s) {
|
182 |
+
CUCHK(cudaFree(s->x));
|
183 |
+
CUCHK(cudaFree(s->xb));
|
184 |
+
CUCHK(cudaFree(s->xb2));
|
185 |
+
CUCHK(cudaFree(s->hb));
|
186 |
+
CUCHK(cudaFree(s->hb2));
|
187 |
+
CUCHK(cudaFree(s->q));
|
188 |
+
CUCHK(cudaFree(s->att));
|
189 |
+
CUCHK(cudaFree(s->logits_gpu));
|
190 |
+
free(s->logits);
|
191 |
+
CUCHK(cudaFree(s->key_cache));
|
192 |
+
CUCHK(cudaFree(s->value_cache));
|
193 |
+
}
|
194 |
+
#else
|
195 |
+
void free_run_state(RunState* s) {
|
196 |
+
free(s->x);
|
197 |
+
free(s->xb);
|
198 |
+
free(s->xb2);
|
199 |
+
free(s->hb);
|
200 |
+
free(s->hb2);
|
201 |
+
free(s->q);
|
202 |
+
free(s->att);
|
203 |
+
free(s->logits);
|
204 |
+
free(s->key_cache);
|
205 |
+
free(s->value_cache);
|
206 |
+
}
|
207 |
+
#endif
|
208 |
+
|
209 |
+
void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) {
|
210 |
+
int head_size = p->dim / p->n_heads;
|
211 |
+
// make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models
|
212 |
+
unsigned long long n_layers = p->n_layers;
|
213 |
+
w->token_embedding_table = ptr;
|
214 |
+
ptr += p->vocab_size * p->dim;
|
215 |
+
w->rms_att_weight = ptr;
|
216 |
+
ptr += n_layers * p->dim;
|
217 |
+
w->wq = ptr;
|
218 |
+
ptr += n_layers * p->dim * (p->n_heads * head_size);
|
219 |
+
w->wk = ptr;
|
220 |
+
ptr += n_layers * p->dim * (p->n_kv_heads * head_size);
|
221 |
+
w->wv = ptr;
|
222 |
+
ptr += n_layers * p->dim * (p->n_kv_heads * head_size);
|
223 |
+
w->wo = ptr;
|
224 |
+
ptr += n_layers * (p->n_heads * head_size) * p->dim;
|
225 |
+
w->rms_ffn_weight = ptr;
|
226 |
+
ptr += n_layers * p->dim;
|
227 |
+
w->w1 = ptr;
|
228 |
+
ptr += n_layers * p->dim * p->hidden_dim;
|
229 |
+
w->w2 = ptr;
|
230 |
+
ptr += n_layers * p->hidden_dim * p->dim;
|
231 |
+
w->w3 = ptr;
|
232 |
+
ptr += n_layers * p->dim * p->hidden_dim;
|
233 |
+
w->rms_final_weight = ptr;
|
234 |
+
ptr += p->dim;
|
235 |
+
ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_real (for RoPE)
|
236 |
+
ptr += p->seq_len * head_size / 2; // skip what used to be freq_cis_imag (for RoPE)
|
237 |
+
w->wcls = shared_weights ? w->token_embedding_table : ptr;
|
238 |
+
}
|
239 |
+
|
240 |
+
void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weights,
|
241 |
+
int* fd, float** data, ssize_t* file_size) {
|
242 |
+
FILE *file = fopen(checkpoint, "rb");
|
243 |
+
if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); }
|
244 |
+
// read in the config header
|
245 |
+
if (fread(config, sizeof(Config), 1, file) != 1) { exit(EXIT_FAILURE); }
|
246 |
+
// negative vocab size is hacky way of signaling unshared weights. bit yikes.
|
247 |
+
int shared_weights = config->vocab_size > 0 ? 1 : 0;
|
248 |
+
config->vocab_size = abs(config->vocab_size);
|
249 |
+
// figure out the file size
|
250 |
+
fseek(file, 0, SEEK_END); // move file pointer to end of file
|
251 |
+
*file_size = ftell(file); // get the file size, in bytes
|
252 |
+
fclose(file);
|
253 |
+
// memory map the Transformer weights into the data pointer
|
254 |
+
*fd = open(checkpoint, O_RDONLY); // open in read only mode
|
255 |
+
if (*fd == -1) { fprintf(stderr, "open failed!\n"); exit(EXIT_FAILURE); }
|
256 |
+
*data = (float *)mmap(NULL, *file_size, PROT_READ, MAP_PRIVATE, *fd, 0);
|
257 |
+
if (*data == MAP_FAILED) { fprintf(stderr, "mmap failed!\n"); exit(EXIT_FAILURE); }
|
258 |
+
#ifdef USE_CUDA
|
259 |
+
// allocate & copy mmap data to the gpu first
|
260 |
+
// TODO: allocate & copy just a portion to the GPU if the weights are too big
|
261 |
+
// to fit in the GPU, then copy the data only as needed while running.
|
262 |
+
float* weights_ptr;
|
263 |
+
size_t weights_size = *file_size - sizeof(Config);
|
264 |
+
CUCHK(cudaMalloc((void**)&weights_ptr, weights_size));
|
265 |
+
CUCHK(cudaMemcpy(weights_ptr, *data + sizeof(Config)/sizeof(float), weights_size, cudaMemcpyHostToDevice));
|
266 |
+
#else
|
267 |
+
float* weights_ptr = *data + sizeof(Config)/sizeof(float);
|
268 |
+
#endif
|
269 |
+
memory_map_weights(weights, config, weights_ptr, shared_weights);
|
270 |
+
}
|
271 |
+
|
272 |
+
void build_transformer(Transformer *t, char* checkpoint_path) {
|
273 |
+
// read in the Config and the Weights from the checkpoint
|
274 |
+
read_checkpoint(checkpoint_path, &t->config, &t->weights, &t->fd, &t->data, &t->file_size);
|
275 |
+
// allocate the RunState buffers
|
276 |
+
malloc_run_state(&t->state, &t->config);
|
277 |
+
}
|
278 |
+
|
279 |
+
void free_transformer(Transformer* t) {
|
280 |
+
// close the memory mapping
|
281 |
+
if (t->data != MAP_FAILED) { munmap(t->data, t->file_size); }
|
282 |
+
if (t->fd != -1) { close(t->fd); }
|
283 |
+
#ifdef USE_CUDA
|
284 |
+
// we cudaMalloc a region of memory, then hand the address to
|
285 |
+
// the token_embedding_table field. Free it here.
|
286 |
+
CUCHK(cudaFree(t->weights.token_embedding_table));
|
287 |
+
#endif
|
288 |
+
// free the RunState buffers
|
289 |
+
free_run_state(&t->state);
|
290 |
+
}
|
291 |
+
|
292 |
+
// ----------------------------------------------------------------------------
|
293 |
+
// neural net blocks; the dynamics of the Transformer
|
294 |
+
|
295 |
+
#ifdef USE_CUDA
|
296 |
+
// Utility routine to divide a into ceiling of b parts
|
297 |
+
int divUp(int a, int b) {
|
298 |
+
return (a - 1) / b + 1;
|
299 |
+
}
|
300 |
+
|
301 |
+
const int num_threads_lrg = 1024;
|
302 |
+
const int num_threads_med = 256;
|
303 |
+
|
304 |
+
__global__ void rmsnorm_kernel(float* o, float* x, float* weight, int size, int elementsPerThread) {
|
305 |
+
// parallel reduction of sum of squares via CUB
|
306 |
+
float ss = 0.0f;
|
307 |
+
for (int i = 0; i < elementsPerThread; i++) {
|
308 |
+
int j = threadIdx.x + i * num_threads_lrg;
|
309 |
+
if (j < size)
|
310 |
+
ss += x[j] * x[j];
|
311 |
+
}
|
312 |
+
using BlockReduce = cub::BlockReduce<float, num_threads_lrg>;
|
313 |
+
__shared__ typename BlockReduce::TempStorage temp;
|
314 |
+
ss = BlockReduce(temp).Sum(ss);
|
315 |
+
|
316 |
+
// serialization point to calculate normalization factor
|
317 |
+
__shared__ float shared_ss;
|
318 |
+
if (threadIdx.x == 0) {
|
319 |
+
ss /= size;
|
320 |
+
ss += 1e-5f;
|
321 |
+
ss = 1.0f / sqrtf(ss);
|
322 |
+
shared_ss = ss;
|
323 |
+
}
|
324 |
+
__syncthreads();
|
325 |
+
ss = shared_ss;
|
326 |
+
|
327 |
+
// normalize and scale
|
328 |
+
for (int i = 0; i < elementsPerThread; i++) {
|
329 |
+
int j = threadIdx.x + i * num_threads_lrg;
|
330 |
+
if (j < size) {
|
331 |
+
o[j] = weight[j] * (ss * x[j]);
|
332 |
+
}
|
333 |
+
}
|
334 |
+
}
|
335 |
+
void rmsnorm(float* o, float* x, float* weight, int size) {
|
336 |
+
int elementsPerThread = divUp(size, num_threads_lrg);
|
337 |
+
rmsnorm_kernel <<<1, num_threads_lrg >>> (o, x, weight, size, elementsPerThread);
|
338 |
+
}
|
339 |
+
#else
|
340 |
+
void rmsnorm(float* o, float* x, float* weight, int size) {
|
341 |
+
// calculate sum of squares
|
342 |
+
float ss = 0.0f;
|
343 |
+
for (int j = 0; j < size; j++) {
|
344 |
+
ss += x[j] * x[j];
|
345 |
+
}
|
346 |
+
ss /= size;
|
347 |
+
ss += 1e-5f;
|
348 |
+
ss = 1.0f / sqrtf(ss);
|
349 |
+
// normalize and scale
|
350 |
+
for (int j = 0; j < size; j++) {
|
351 |
+
o[j] = weight[j] * (ss * x[j]);
|
352 |
+
}
|
353 |
+
}
|
354 |
+
#endif
|
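For readers following the math, the RMSNorm computed by both the CUDA kernel and the C fallback above matches this small NumPy reference (a sketch for checking intent, not part of the build):

```python
import numpy as np

def rmsnorm(x: np.ndarray, weight: np.ndarray, eps: float = 1e-5) -> np.ndarray:
    # o[j] = weight[j] * x[j] / sqrt(mean(x^2) + eps), as in the kernels above
    return weight * x / np.sqrt(np.mean(x * x) + eps)
```
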
355 |
+
|
356 |
+
#ifdef USE_CUDA
|
357 |
+
__device__ void softmax_gpu(float* __restrict__ x, int size) {
|
358 |
+
int tid = threadIdx.x;
|
359 |
+
int step = blockDim.x;
|
360 |
+
|
361 |
+
// find max value (for numerical stability)
|
362 |
+
float max_val = tid < size ? x[tid] : 0;
|
363 |
+
for (int i = tid + step; i < size; i += step) {
|
364 |
+
if (x[i] > max_val) {
|
365 |
+
max_val = x[i];
|
366 |
+
}
|
367 |
+
}
|
368 |
+
using BlockReduce = cub::BlockReduce<float, num_threads_lrg>;
|
369 |
+
__shared__ typename BlockReduce::TempStorage temp;
|
370 |
+
__shared__ float shared_val;
|
371 |
+
max_val = BlockReduce(temp).Reduce(max_val, cub::Max());
|
372 |
+
if (threadIdx.x == 0) {
|
373 |
+
shared_val = max_val;
|
374 |
+
}
|
375 |
+
__syncthreads();
|
376 |
+
max_val = shared_val;
|
377 |
+
|
378 |
+
// exp and sum
|
379 |
+
float sum = 0.0f;
|
380 |
+
for (int i = tid; i < size; i += step) {
|
381 |
+
x[i] = expf(x[i] - max_val);
|
382 |
+
sum += x[i];
|
383 |
+
}
|
384 |
+
sum = BlockReduce(temp).Sum(sum);
|
385 |
+
if (threadIdx.x == 0) {
|
386 |
+
shared_val = sum;
|
387 |
+
}
|
388 |
+
__syncthreads();
|
389 |
+
sum = shared_val;
|
390 |
+
|
391 |
+
// normalize
|
392 |
+
for (int i = tid; i < size; i += step) {
|
393 |
+
x[i] /= sum;
|
394 |
+
}
|
395 |
+
}
|
396 |
+
#endif
|
397 |
+
void softmax(float* x, int size) {
|
398 |
+
// find max value (for numerical stability)
|
399 |
+
float max_val = x[0];
|
400 |
+
for (int i = 1; i < size; i++) {
|
401 |
+
if (x[i] > max_val) {
|
402 |
+
max_val = x[i];
|
403 |
+
}
|
404 |
+
}
|
405 |
+
// exp and sum
|
406 |
+
float sum = 0.0f;
|
407 |
+
for (int i = 0; i < size; i++) {
|
408 |
+
x[i] = expf(x[i] - max_val);
|
409 |
+
sum += x[i];
|
410 |
+
}
|
411 |
+
// normalize
|
412 |
+
for (int i = 0; i < size; i++) {
|
413 |
+
x[i] /= sum;
|
414 |
+
}
|
415 |
+
}
|
416 |
+
|
417 |
+
#ifdef USE_CUDA
|
418 |
+
// Use cuBLAS for matmul to leverage this included, high-performance library.
|
419 |
+
void matmul(cublasHandle_t handle, float* xout, float* x, float* w, int n, int d) {
|
420 |
+
// W (d,n) @ x (n,) -> xout (d,)
|
421 |
+
// W is stored in this order: (n=0,d=0), (n=1,d=0), (n=2,d=0), ...
|
422 |
+
// so W is n x d in cublas terms & we'll need to transpose.
|
423 |
+
// Sgemv does y = alpha * op(A) * x + beta * y (modifying y)
|
424 |
+
// where op can transpose the matrix A
|
425 |
+
// Translating to our local vars, that is
|
426 |
+
// xout = 1.0*op(w)*x + 0.0*xout
|
427 |
+
float alpha = 1.0f;
|
428 |
+
float beta = 0.0f; // when this is 0, xout will not be used for input
|
429 |
+
cublasSgemv(handle, CUBLAS_OP_T, n, d, &alpha, w, n, x, 1, &beta, xout, 1);
|
430 |
+
}
|
431 |
+
#else
|
432 |
+
void matmul(float* xout, float* x, float* w, int n, int d) {
|
433 |
+
// W (d,n) @ x (n,) -> xout (d,)
|
434 |
+
// by far the most amount of time is spent inside this little function
|
435 |
+
int i;
|
436 |
+
#pragma omp parallel for private(i)
|
437 |
+
for (i = 0; i < d; i++) {
|
438 |
+
float val = 0.0f;
|
439 |
+
for (int j = 0; j < n; j++) {
|
440 |
+
val += w[i * n + j] * x[j];
|
441 |
+
}
|
442 |
+
xout[i] = val;
|
443 |
+
}
|
444 |
+
}
|
445 |
+
#endif
|
446 |
+
|
447 |
+
// Additional neural net blocks (brought out from transformer function)
|
448 |
+
#ifdef USE_CUDA
|
449 |
+
__global__ void RoPe_rotation_kernel(int pos, float *sq, float *sk, int kv_dim, int head_size) {
|
450 |
+
int i = threadIdx.x * 2 + blockIdx.x * head_size;
|
451 |
+
int head_dim = i % head_size;
|
452 |
+
float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size);
|
453 |
+
float val = pos * freq;
|
454 |
+
float fcr = cosf(val);
|
455 |
+
float fci = sinf(val);
|
456 |
+
int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
|
457 |
+
for (int v = 0; v < rotn; v++) {
|
458 |
+
float* vec = v == 0 ? sq : sk; // the vector to rotate (query or key)
|
459 |
+
float v0 = vec[i];
|
460 |
+
float v1 = vec[i+1];
|
461 |
+
vec[i] = v0 * fcr - v1 * fci;
|
462 |
+
vec[i+1] = v0 * fci + v1 * fcr;
|
463 |
+
}
|
464 |
+
}
|
465 |
+
void RoPe_rotation(int pos, RunState* s, int dim, int kv_dim, int head_size) {
|
466 |
+
RoPe_rotation_kernel <<<dim/head_size, head_size/2 >>> (pos, s->q, s->k, kv_dim, head_size);
|
467 |
+
}
|
468 |
+
#else
|
469 |
+
void RoPe_rotation(int pos, RunState* s, int dim, int kv_dim, int head_size) { //s->q, s->k, freq_cis_real_row, freq_cis_imag_row, p->n_heads, head_size) {
|
470 |
+
for (int i = 0; i < dim; i+=2) {
|
471 |
+
int head_dim = i % head_size;
|
472 |
+
float freq = 1.0f / powf(10000.0f, head_dim / (float)head_size);
|
473 |
+
float val = pos * freq;
|
474 |
+
float fcr = cosf(val);
|
475 |
+
float fci = sinf(val);
|
476 |
+
int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only
|
477 |
+
for (int v = 0; v < rotn; v++) {
|
478 |
+
float* vec = v == 0 ? s->q : s->k; // the vector to rotate (query or key)
|
479 |
+
float v0 = vec[i];
|
480 |
+
float v1 = vec[i+1];
|
481 |
+
vec[i] = v0 * fcr - v1 * fci;
|
482 |
+
vec[i+1] = v0 * fci + v1 * fcr;
|
483 |
+
}
|
484 |
+
}
|
485 |
+
}
|
486 |
+
#endif
|
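The RoPE rotation above treats each adjacent pair of elements as a 2D point and rotates it by a position-dependent angle. A NumPy sketch of the same update for a single vector (illustrative only; the C code applies it to q across all dims and to k only below kv_dim):

```python
import numpy as np

def rope_rotate(vec: np.ndarray, pos: int, head_size: int) -> np.ndarray:
    out = vec.copy()
    for i in range(0, len(vec), 2):
        head_dim = i % head_size
        freq = 1.0 / (10000.0 ** (head_dim / head_size))
        angle = pos * freq
        fcr, fci = np.cos(angle), np.sin(angle)
        out[i] = vec[i] * fcr - vec[i + 1] * fci
        out[i + 1] = vec[i] * fci + vec[i + 1] * fcr
    return out
```
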
487 |
+
|
488 |
+
#ifdef USE_CUDA
|
489 |
+
// TODO refactor vs C code
|
490 |
+
__global__ void multi_head_attention_kernel(int pos, int seq_len, float *sq, float *satt, float *sxb, float *key_cache, float *value_cache, int kv_dim, int kv_mul, int head_size, int loff) {
|
491 |
+
int h = blockIdx.x;
|
492 |
+
// get the query vector for this head
|
493 |
+
float* q = sq + h * head_size;
|
494 |
+
// attention scores for this head
|
495 |
+
float* att = satt + h * seq_len;
|
496 |
+
// iterate over all timesteps, including the current one
|
497 |
+
// In CUDA, each thread does a small portion of the calc
|
498 |
+
for (int t = threadIdx.x; t <= pos; t += blockDim.x) {
|
499 |
+
// get the key vector for this head and at this timestep
|
500 |
+
float* k = key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
|
501 |
+
// calculate the attention score as the dot product of q and k
|
502 |
+
float score = 0.0f;
|
503 |
+
for (int i = 0; i < head_size; i++) {
|
504 |
+
score += q[i] * k[i];
|
505 |
+
}
|
506 |
+
score /= sqrtf(head_size);
|
507 |
+
// save the score to the attention buffer
|
508 |
+
att[t] = score;
|
509 |
+
}
|
510 |
+
// above was this thread's portion of the iteration. wait for all threads to finish
|
511 |
+
__syncthreads();
|
512 |
+
|
513 |
+
// softmax the scores to get attention weights, from 0..pos inclusively
|
514 |
+
softmax_gpu(att, pos + 1);
|
515 |
+
__syncthreads();
|
516 |
+
|
517 |
+
// weighted sum of the values, store back into xb
|
518 |
+
// NOTE: by swapping the order of the for loops (vs. C) a simpler
|
519 |
+
// version of the code accomplishes the same task and fits more
|
520 |
+
// naturally with the CUDA way of subdividing the problem.
|
521 |
+
float* xb = sxb + h * head_size;
|
522 |
+
for (int i = threadIdx.x; i < head_size; i += blockDim.x) {
|
523 |
+
float val = 0.0f;
|
524 |
+
for (int t = 0; t <= pos; t++) {
|
525 |
+
// get the value vector for this head and at this timestep
|
526 |
+
float* v = value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
|
527 |
+
// get the attention weight for this timestep
|
528 |
+
float a = att[t];
|
529 |
+
val += a * v[i];
|
530 |
+
}
|
531 |
+
xb[i] = val;
|
532 |
+
}
|
533 |
+
}
|
534 |
+
void multi_head_attention(int pos, Config* p, RunState* s, int kv_dim, int kv_mul, int head_size, int loff) {
|
535 |
+
multi_head_attention_kernel <<<p->n_heads, num_threads_lrg>>> (pos, p->seq_len, s->q, s->att, s->xb, s->key_cache, s->value_cache, kv_dim, kv_mul, head_size, loff);
|
536 |
+
}
|
537 |
+
#else
|
538 |
+
void multi_head_attention(int pos, Config* p, RunState* s, int kv_dim, int kv_mul, int head_size, int loff) {
|
539 |
+
int h;
|
540 |
+
#pragma omp parallel for private(h)
|
541 |
+
for (h = 0; h < p->n_heads; h++) {
|
542 |
+
// get the query vector for this head
|
543 |
+
float* q = s->q + h * head_size;
|
544 |
+
// attention scores for this head
|
545 |
+
float* att = s->att + h * p->seq_len;
|
546 |
+
// iterate over all timesteps, including the current one
|
547 |
+
for (int t = 0; t <= pos; t++) {
|
548 |
+
// get the key vector for this head and at this timestep
|
549 |
+
float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
|
550 |
+
// calculate the attention score as the dot product of q and k
|
551 |
+
float score = 0.0f;
|
552 |
+
for (int i = 0; i < head_size; i++) {
|
553 |
+
score += q[i] * k[i];
|
554 |
+
}
|
555 |
+
score /= sqrtf(head_size);
|
556 |
+
// save the score to the attention buffer
|
557 |
+
att[t] = score;
|
558 |
+
}
|
559 |
+
|
560 |
+
// softmax the scores to get attention weights, from 0..pos inclusively
|
561 |
+
softmax(att, pos + 1);
|
562 |
+
|
563 |
+
// weighted sum of the values, store back into xb
|
564 |
+
float* xb = s->xb + h * head_size;
|
565 |
+
memset(xb, 0, head_size * sizeof(float));
|
566 |
+
for (int t = 0; t <= pos; t++) {
|
567 |
+
// get the value vector for this head and at this timestep
|
568 |
+
float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
|
569 |
+
// get the attention weight for this timestep
|
570 |
+
float a = att[t];
|
571 |
+
// accumulate the weighted value into xb
|
572 |
+
for (int i = 0; i < head_size; i++) {
|
573 |
+
xb[i] += a * v[i];
|
574 |
+
}
|
575 |
+
}
|
576 |
+
}
|
577 |
+
}
|
578 |
+
#endif
|
579 |
+
|
580 |
+
#ifdef USE_CUDA
|
581 |
+
__global__ void f_silu_elementwise_mul_w3_kernel(float *shb, float *shb2, int hidden_dim) {
|
582 |
+
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
583 |
+
if (i < hidden_dim) {
|
584 |
+
float val = shb[i];
|
585 |
+
// silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
|
586 |
+
val *= (1.0f / (1.0f + expf(-val)));
|
587 |
+
// elementwise multiply with w3(x)
|
588 |
+
val *= shb2[i];
|
589 |
+
shb[i] = val;
|
590 |
+
}
|
591 |
+
}
|
592 |
+
void f_silu_elementwise_mul_w3(RunState *s, int hidden_dim) {
|
593 |
+
f_silu_elementwise_mul_w3_kernel<<<divUp(hidden_dim, num_threads_med), num_threads_med>>>(s->hb, s->hb2, hidden_dim);
|
594 |
+
}
|
595 |
+
#else
|
596 |
+
void f_silu_elementwise_mul_w3(RunState *s, int hidden_dim) {
|
597 |
+
for (int i = 0; i < hidden_dim; i++) {
|
598 |
+
float val = s->hb[i];
|
599 |
+
// silu(x)=x*σ(x), where σ(x) is the logistic sigmoid
|
600 |
+
val *= (1.0f / (1.0f + expf(-val)));
|
601 |
+
// elementwise multiply with w3(x)
|
602 |
+
val *= s->hb2[i];
|
603 |
+
s->hb[i] = val;
|
604 |
+
}
|
605 |
+
}
|
606 |
+
#endif
|
607 |
+
|
608 |
+
#ifdef USE_CUDA
|
609 |
+
__global__ void accum_kernel(float* a, float* b, int size) {
|
610 |
+
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
611 |
+
if (i < size) {
|
612 |
+
a[i] += b[i];
|
613 |
+
}
|
614 |
+
}
|
615 |
+
void accum(float *a, float *b, int size) {
|
616 |
+
accum_kernel<<<divUp(size, num_threads_med), num_threads_med>>>(a,b,size);
|
617 |
+
}
|
618 |
+
#else
|
619 |
+
void accum(float *a, float *b, int size) {
|
620 |
+
for (int i = 0; i < size; i++) {
|
621 |
+
a[i] += b[i];
|
622 |
+
}
|
623 |
+
}
|
624 |
+
#endif
|
625 |
+
|
626 |
+
#ifdef USE_CUDA
|
627 |
+
float* forward(Transformer* transformer, int token, int pos, cublasHandle_t handle) {
|
628 |
+
#else
|
629 |
+
float* forward(Transformer* transformer, int token, int pos) {
|
630 |
+
#endif
|
631 |
+
// a few convenience variables
|
632 |
+
Config* p = &transformer->config;
|
633 |
+
TransformerWeights* w = &transformer->weights;
|
634 |
+
RunState* s = &transformer->state;
|
635 |
+
float *x = s->x;
|
636 |
+
int dim = p->dim;
|
637 |
+
int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
|
638 |
+
int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery
|
639 |
+
int hidden_dim = p->hidden_dim;
|
640 |
+
int head_size = dim / p->n_heads;
|
641 |
+
|
642 |
+
// copy the token embedding into x
|
643 |
+
float* content_row = w->token_embedding_table + token * dim;
|
644 |
+
#ifdef USE_CUDA
|
645 |
+
CUCHK(cudaMemcpy(x, content_row, dim*sizeof(*x), cudaMemcpyDeviceToDevice));
|
646 |
+
#else
|
647 |
+
memcpy(x, content_row, dim*sizeof(*x));
|
648 |
+
#endif
|
649 |
+
|
650 |
+
// forward all the layers
|
651 |
+
for(unsigned long long l = 0; l < p->n_layers; l++) {
|
652 |
+
|
653 |
+
// attention rmsnorm
|
654 |
+
rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim);
|
655 |
+
|
656 |
+
// key and value point to the kv cache
|
657 |
+
int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
|
658 |
+
s->k = s->key_cache + loff + pos * kv_dim;
|
659 |
+
s->v = s->value_cache + loff + pos * kv_dim;
|
660 |
+
|
661 |
+
// qkv matmuls for this position
|
662 |
+
#ifdef USE_CUDA
|
663 |
+
matmul(handle, s->q, s->xb, w->wq + l*dim*dim, dim, dim);
|
664 |
+
matmul(handle, s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim);
|
665 |
+
matmul(handle, s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim);
|
666 |
+
#else
|
667 |
+
matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
|
668 |
+
matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim);
|
669 |
+
matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim);
|
670 |
+
#endif
|
671 |
+
// RoPE relative positional encoding: complex-valued rotate q and k in each head
|
672 |
+
RoPe_rotation(pos, s, dim, kv_dim, head_size);
|
673 |
+
|
674 |
+
// multihead attention. iterate over all heads
|
675 |
+
multi_head_attention(pos, p, s, kv_dim, kv_mul, head_size, loff);
|
676 |
+
|
677 |
+
// final matmul to get the output of the attention
|
678 |
+
#ifdef USE_CUDA
|
679 |
+
matmul(handle, s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
|
680 |
+
#else
|
681 |
+
matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
|
682 |
+
#endif
|
683 |
+
|
684 |
+
// residual connection back into x
|
685 |
+
accum(x, s->xb2, dim);
|
686 |
+
|
687 |
+
// ffn rmsnorm
|
688 |
+
rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);
|
689 |
+
|
690 |
+
// Now for FFN in PyTorch we have: self.w2(F.silu(self.w1(x)) * self.w3(x))
|
691 |
+
// first calculate self.w1(x) and self.w3(x)
|
692 |
+
#ifdef USE_CUDA
|
693 |
+
matmul(handle, s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
|
694 |
+
matmul(handle, s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
|
695 |
+
#else
|
696 |
+
matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
|
697 |
+
matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
|
698 |
+
#endif
|
699 |
+
|
700 |
+
// SwiGLU non-linearity
|
701 |
+
f_silu_elementwise_mul_w3(s, hidden_dim);
|
702 |
+
|
703 |
+
// final matmul to get the output of the ffn
|
704 |
+
#ifdef USE_CUDA
|
705 |
+
matmul(handle, s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
|
706 |
+
#else
|
707 |
+
matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
|
708 |
+
#endif
|
709 |
+
|
710 |
+
// residual connection
|
711 |
+
accum(x, s->xb, dim);
|
712 |
+
}
|
713 |
+
|
714 |
+
// final rmsnorm
|
715 |
+
rmsnorm(x, x, w->rms_final_weight, dim);
|
716 |
+
|
717 |
+
// classifier into logits
|
718 |
+
#ifdef USE_CUDA
|
719 |
+
matmul(handle, s->logits_gpu, x, w->wcls, p->dim, p->vocab_size);
|
720 |
+
CUCHK(cudaMemcpy(s->logits, s->logits_gpu, p->vocab_size * sizeof(float), cudaMemcpyDeviceToHost));
|
721 |
+
#else
|
722 |
+
matmul(s->logits, x, w->wcls, p->dim, p->vocab_size);
|
723 |
+
#endif
|
724 |
+
return s->logits;
|
725 |
+
}
|
726 |
+
|
727 |
+
// ----------------------------------------------------------------------------
|
728 |
+
// The Byte Pair Encoding (BPE) Tokenizer that translates strings <-> tokens
|
729 |
+
|
730 |
+
typedef struct {
|
731 |
+
char *str;
|
732 |
+
int id;
|
733 |
+
} TokenIndex;
|
734 |
+
|
735 |
+
typedef struct {
|
736 |
+
char** vocab;
|
737 |
+
float* vocab_scores;
|
738 |
+
TokenIndex *sorted_vocab;
|
739 |
+
int vocab_size;
|
740 |
+
unsigned int max_token_length;
|
741 |
+
unsigned char byte_pieces[512]; // stores all single-byte strings
|
742 |
+
} Tokenizer;
|
743 |
+
|
744 |
+
int compare_tokens(const void *a, const void *b) {
|
745 |
+
return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
|
746 |
+
}
|
747 |
+
|
748 |
+
void build_tokenizer(Tokenizer* t, char* tokenizer_path, int vocab_size) {
|
749 |
+
// i should have written the vocab_size into the tokenizer file... sigh
|
750 |
+
t->vocab_size = vocab_size;
|
751 |
+
// malloc space to hold the scores and the strings
|
752 |
+
t->vocab = (char**)malloc(vocab_size * sizeof(char*));
|
753 |
+
t->vocab_scores = (float*)malloc(vocab_size * sizeof(float));
|
754 |
+
t->sorted_vocab = NULL; // initialized lazily
|
755 |
+
for (int i = 0; i < 256; i++) {
|
756 |
+
t->byte_pieces[i * 2] = (unsigned char)i;
|
757 |
+
t->byte_pieces[i * 2 + 1] = '\0';
|
758 |
+
}
|
759 |
+
// read in the file
|
760 |
+
FILE *file = fopen(tokenizer_path, "rb");
|
761 |
+
if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer_path); exit(EXIT_FAILURE); }
|
762 |
+
if (fread(&t->max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
|
763 |
+
int len;
|
764 |
+
for (int i = 0; i < vocab_size; i++) {
|
765 |
+
if (fread(t->vocab_scores + i, sizeof(float), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE);}
|
766 |
+
if (fread(&len, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
|
767 |
+
t->vocab[i] = (char *)malloc(len + 1);
|
768 |
+
if (fread(t->vocab[i], len, 1, file) != 1) { fprintf(stderr, "failed read\n"); exit(EXIT_FAILURE); }
|
769 |
+
t->vocab[i][len] = '\0'; // add the string terminating token
|
770 |
+
}
|
771 |
+
fclose(file);
|
772 |
+
}
|
773 |
+
|
774 |
+
void free_tokenizer(Tokenizer* t) {
|
775 |
+
for (int i = 0; i < t->vocab_size; i++) { free(t->vocab[i]); }
|
776 |
+
free(t->vocab);
|
777 |
+
free(t->vocab_scores);
|
778 |
+
free(t->sorted_vocab);
|
779 |
+
}
|
780 |
+
|
781 |
+
char* decode(Tokenizer* t, int prev_token, int token) {
|
782 |
+
char *piece = t->vocab[token];
|
783 |
+
// following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
|
784 |
+
if (prev_token == 1 && piece[0] == ' ') { piece++; }
|
785 |
+
// careful, some tokens designate raw bytes, and look like e.g. '<0x01>'
|
786 |
+
// parse this and convert and return the actual byte
|
787 |
+
unsigned char byte_val;
|
788 |
+
if (sscanf(piece, "<0x%02hhX>", &byte_val) == 1) {
|
789 |
+
piece = (char*)t->byte_pieces + byte_val * 2;
|
790 |
+
}
|
791 |
+
return piece;
|
792 |
+
}
|
793 |
+
|
794 |
+
void safe_printf(char *piece) {
|
795 |
+
// piece might be a raw byte token, and we only want to print printable chars or whitespace
|
796 |
+
// because some of the other bytes can be various control codes, backspace, etc.
|
797 |
+
if (piece == NULL) { return; }
|
798 |
+
if (piece[0] == '\0') { return; }
|
799 |
+
if (piece[1] == '\0') {
|
800 |
+
unsigned char byte_val = piece[0];
|
801 |
+
if (!(isprint(byte_val) || isspace(byte_val))) {
|
802 |
+
return; // bad byte, don't print it
|
803 |
+
}
|
804 |
+
}
|
805 |
+
printf("%s", piece);
|
806 |
+
}
|
807 |
+
|
808 |
+
int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
|
809 |
+
// efficiently find the perfect match for str in vocab, return its index or -1 if not found
|
810 |
+
#if defined USE_CUDA && defined _WIN32
|
811 |
+
// CUDA on Windows was not capable of handling the syntax below
|
812 |
+
TokenIndex tok;
|
813 |
+
tok.str = str;
|
814 |
+
#else
|
815 |
+
TokenIndex tok = { .str = str }; // acts as the key to search for
|
816 |
+
#endif
|
817 |
+
TokenIndex *res = (TokenIndex *)bsearch(&tok, sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
|
818 |
+
return res != NULL ? res->id : -1;
|
819 |
+
}
|
820 |
+
|
821 |
+
void encode(Tokenizer* t, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
|
822 |
+
// encode the string text (input) into an upper-bound preallocated tokens[] array
|
823 |
+
// bos != 0 means prepend the BOS token (=1), eos != 0 means append the EOS token (=2)
|
824 |
+
if (text == NULL) { fprintf(stderr, "cannot encode NULL text\n"); exit(EXIT_FAILURE); }
|
825 |
+
|
826 |
+
if (t->sorted_vocab == NULL) {
|
827 |
+
// lazily malloc and sort the vocabulary
|
828 |
+
t->sorted_vocab = (TokenIndex *)malloc(t->vocab_size * sizeof(TokenIndex));
|
829 |
+
for (int i = 0; i < t->vocab_size; i++) {
|
830 |
+
t->sorted_vocab[i].str = t->vocab[i];
|
831 |
+
t->sorted_vocab[i].id = i;
|
832 |
+
}
|
833 |
+
qsort(t->sorted_vocab, t->vocab_size, sizeof(TokenIndex), compare_tokens);
|
834 |
+
}
|
835 |
+
|
836 |
+
// create a temporary buffer that will store merge candidates of always two consecutive tokens
|
837 |
+
// *2 for concat, +1 for null terminator +2 for UTF8 (in case max_token_length is 1)
|
838 |
+
char* str_buffer = (char *)malloc((t->max_token_length*2 +1 +2) * sizeof(char));
|
839 |
+
size_t str_len = 0;
|
840 |
+
|
841 |
+
// start at 0 tokens
|
842 |
+
*n_tokens = 0;
|
843 |
+
|
844 |
+
// add optional BOS (=1) token, if desired
|
845 |
+
if (bos) tokens[(*n_tokens)++] = 1;
|
846 |
+
|
847 |
+
// add_dummy_prefix is true by default
|
848 |
+
// so prepend a dummy prefix token to the input string, but only if text != ""
|
849 |
+
// TODO: pretty sure this isn't correct in the general case but I don't have the
|
850 |
+
// energy to read more of the sentencepiece code to figure out what it's doing
|
851 |
+
if (text[0] != '\0') {
|
852 |
+
int dummy_prefix = str_lookup((char *)" ", t->sorted_vocab, t->vocab_size);
|
853 |
+
tokens[(*n_tokens)++] = dummy_prefix;
|
854 |
+
}
|
855 |
+
|
856 |
+
// Okay UTF-8 time. This will get messy. Here is the reference from Wikipedia:
|
857 |
+
// Code point ↔ UTF-8 conversion
|
858 |
+
// First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
|
859 |
+
// U+0000 U+007F 0xxxxxxx
|
860 |
+
// U+0080 U+07FF 110xxxxx 10xxxxxx
|
861 |
+
// U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
862 |
+
// U+10000 U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
863 |
+
|
864 |
+
// process the raw (UTF-8) byte sequence of the input string
|
865 |
+
for (char *c = text; *c != '\0'; c++) {
|
866 |
+
|
867 |
+
// reset buffer if the current byte is ASCII or a leading byte
|
868 |
+
// 0xC0 is 11000000, so (*c & 0xC0) keeps the first 2 bits and zeros the rest
|
869 |
+
// 0x80 is 10000000
|
870 |
+
// in UTF-8, all continuation bytes start with "10" in first two bits
|
871 |
+
// so in English this is: "if this byte is not a continuation byte"
|
872 |
+
if ((*c & 0xC0) != 0x80) {
|
873 |
+
// this byte must be either a leading byte (11...) or an ASCII char (0x...)
|
874 |
+
// => reset our location, as we're starting a new UTF-8 codepoint
|
875 |
+
str_len = 0;
|
876 |
+
}
|
877 |
+
|
878 |
+
// append the current byte to the buffer
|
879 |
+
str_buffer[str_len++] = *c; // ++ is post-increment, incremented after this line
|
880 |
+
str_buffer[str_len] = '\0';
|
881 |
+
|
882 |
+
// while the next character is a continuation byte, continue appending
|
883 |
+
// but if there are too many of them, just stop to avoid overrunning str_buffer size.
|
884 |
+
if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
|
885 |
+
continue;
|
886 |
+
}
|
887 |
+
|
888 |
+
// ok c+1 is not a continuation byte, so we've read in a full codepoint
|
889 |
+
int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
|
890 |
+
|
891 |
+
if (id != -1) {
|
892 |
+
// we found this codepoint in vocab, add it as a token
|
893 |
+
tokens[(*n_tokens)++] = id;
|
894 |
+
} else {
|
895 |
+
// byte_fallback encoding: just encode each byte as a token
|
896 |
+
// +3 is here because the first 3 vocab elements are <unk>, <s>, </s>
|
897 |
+
// so the individual bytes only start at index 3
|
898 |
+
for (int i=0; i < str_len; i++) {
|
899 |
+
tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
|
900 |
+
}
|
901 |
+
}
|
902 |
+
str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
|
903 |
+
}
|
904 |
+
|
905 |
+
// merge the best consecutive pair each iteration, according to the scores in vocab_scores
|
906 |
+
while (1) {
|
907 |
+
float best_score = -1e10;
|
908 |
+
int best_id = -1;
|
909 |
+
int best_idx = -1;
|
910 |
+
|
911 |
+
for (int i=0; i < (*n_tokens-1); i++) {
|
912 |
+
// check if we can merge the pair (tokens[i], tokens[i+1])
|
913 |
+
sprintf(str_buffer, "%s%s", t->vocab[tokens[i]], t->vocab[tokens[i+1]]);
|
914 |
+
int id = str_lookup(str_buffer, t->sorted_vocab, t->vocab_size);
|
915 |
+
if (id != -1 && t->vocab_scores[id] > best_score) {
|
916 |
+
// this merge pair exists in vocab! record its score and position
|
917 |
+
best_score = t->vocab_scores[id];
|
918 |
+
best_id = id;
|
919 |
+
best_idx = i;
|
920 |
+
}
|
921 |
+
}
|
922 |
+
|
923 |
+
if (best_idx == -1) {
|
924 |
+
break; // we couldn't find any more pairs to merge, so we're done
|
925 |
+
}
|
926 |
+
|
927 |
+
// merge the consecutive pair (best_idx, best_idx+1) into new token best_id
|
928 |
+
tokens[best_idx] = best_id;
|
929 |
+
// delete token at position best_idx+1, shift the entire sequence back 1
|
930 |
+
for (int i = best_idx+1; i < (*n_tokens-1); i++) {
|
931 |
+
tokens[i] = tokens[i+1];
|
932 |
+
}
|
933 |
+
(*n_tokens)--; // token length decreased
|
934 |
+
}
|
935 |
+
|
936 |
+
// add optional EOS (=2) token, if desired
|
937 |
+
if (eos) tokens[(*n_tokens)++] = 2;
|
938 |
+
|
939 |
+
free(str_buffer);
|
940 |
+
}
|
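The merge loop inside `encode()` above is greedy BPE: on every pass it joins the adjacent token pair whose concatenation has the highest `vocab_scores` entry, until no mergeable pair remains. A compact Python sketch of that loop (names are illustrative):

```python
def bpe_merge(tokens, vocab, vocab_scores, lookup):
    """Greedily merge adjacent pairs; lookup(s) returns a token id or -1, like str_lookup()."""
    while True:
        best_score, best_id, best_idx = float("-inf"), -1, -1
        for i in range(len(tokens) - 1):
            merged = lookup(vocab[tokens[i]] + vocab[tokens[i + 1]])
            if merged != -1 and vocab_scores[merged] > best_score:
                best_score, best_id, best_idx = vocab_scores[merged], merged, i
        if best_idx == -1:
            break  # nothing left to merge
        tokens[best_idx:best_idx + 2] = [best_id]  # replace the pair with the merged token
    return tokens
```
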
941 |
+
|
942 |
+
// ----------------------------------------------------------------------------
|
943 |
+
// The Sampler, which takes logits and returns a sampled token
|
944 |
+
// sampling can be done in a few ways: greedy argmax, sampling, top-p sampling
|
945 |
+
|
946 |
+
typedef struct {
|
947 |
+
float prob;
|
948 |
+
int index;
|
949 |
+
} ProbIndex; // struct used when sorting probabilities during top-p sampling
|
950 |
+
|
951 |
+
typedef struct {
|
952 |
+
int vocab_size;
|
953 |
+
ProbIndex* probindex; // buffer used in top-p sampling
|
954 |
+
float temperature;
|
955 |
+
float topp;
|
956 |
+
unsigned long long rng_state;
|
957 |
+
} Sampler;
|
958 |
+
|
959 |
+
int sample_argmax(float* probabilities, int n) {
|
960 |
+
// return the index that has the highest probability
|
961 |
+
int max_i = 0;
|
962 |
+
float max_p = probabilities[0];
|
963 |
+
for (int i = 1; i < n; i++) {
|
964 |
+
if (probabilities[i] > max_p) {
|
965 |
+
max_i = i;
|
966 |
+
max_p = probabilities[i];
|
967 |
+
}
|
968 |
+
}
|
969 |
+
return max_i;
|
970 |
+
}
|
971 |
+
|
972 |
+
int sample_mult(float* probabilities, int n, float coin) {
|
973 |
+
// sample index from probabilities (they must sum to 1!)
|
974 |
+
// coin is a random number in [0, 1), usually from random_f32()
|
975 |
+
float cdf = 0.0f;
|
976 |
+
for (int i = 0; i < n; i++) {
|
977 |
+
cdf += probabilities[i];
|
978 |
+
if (coin < cdf) {
|
979 |
+
return i;
|
980 |
+
}
|
981 |
+
}
|
982 |
+
return n - 1; // in case of rounding errors
|
983 |
+
}
|
984 |
+
|
985 |
+
int compare(const void* a, const void* b) {
|
986 |
+
ProbIndex* a_ = (ProbIndex*) a;
|
987 |
+
ProbIndex* b_ = (ProbIndex*) b;
|
988 |
+
if (a_->prob > b_->prob) return -1;
|
989 |
+
if (a_->prob < b_->prob) return 1;
|
990 |
+
return 0;
|
991 |
+
}
|
992 |
+
|
993 |
+
int sample_topp(float* probabilities, int n, float topp, ProbIndex* probindex, float coin) {
|
994 |
+
// top-p sampling (or "nucleus sampling") samples from the smallest set of
|
995 |
+
// tokens that exceed probability topp. This way we never sample tokens that
|
996 |
+
// have very low probabilities and are less likely to go "off the rails".
|
997 |
+
// coin is a random number in [0, 1), usually from random_f32()
|
998 |
+
|
999 |
+
int n0 = 0;
|
1000 |
+
// quicksort indices in descending order of probabilities
|
1001 |
+
// values smaller than (1 - topp) / (n - 1) cannot be part of the result
|
1002 |
+
// so for efficiency we crop these out as candidates before sorting
|
1003 |
+
const float cutoff = (1.0f - topp) / (n - 1);
|
1004 |
+
for (int i = 0; i < n; i++) {
|
1005 |
+
if (probabilities[i] >= cutoff) {
|
1006 |
+
probindex[n0].index = i;
|
1007 |
+
probindex[n0].prob = probabilities[i];
|
1008 |
+
n0++;
|
1009 |
+
}
|
1010 |
+
}
|
1011 |
+
qsort(probindex, n0, sizeof(ProbIndex), compare);
|
1012 |
+
|
1013 |
+
// truncate the list where cumulative probability exceeds topp
|
1014 |
+
float cumulative_prob = 0.0f;
|
1015 |
+
int last_idx = n0 - 1; // in case of rounding errors consider all elements
|
1016 |
+
for (int i = 0; i < n0; i++) {
|
1017 |
+
cumulative_prob += probindex[i].prob;
|
1018 |
+
if (cumulative_prob > topp) {
|
1019 |
+
last_idx = i;
|
1020 |
+
break; // we've exceeded topp by including last_idx
|
1021 |
+
}
|
1022 |
+
}
|
1023 |
+
|
1024 |
+
// sample from the truncated list
|
1025 |
+
float r = coin * cumulative_prob;
|
1026 |
+
float cdf = 0.0f;
|
1027 |
+
for (int i = 0; i <= last_idx; i++) {
|
1028 |
+
cdf += probindex[i].prob;
|
1029 |
+
if (r < cdf) {
|
1030 |
+
return probindex[i].index;
|
1031 |
+
}
|
1032 |
+
}
|
1033 |
+
return probindex[last_idx].index; // in case of rounding errors
|
1034 |
+
}
|
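`sample_topp` above implements nucleus sampling: keep only the most probable tokens whose cumulative mass reaches `topp`, renormalize, and sample inside that set. An equivalent NumPy sketch (illustrative; boundary handling differs slightly from the C code):

```python
import numpy as np

def sample_topp_np(probs: np.ndarray, topp: float, rng: np.random.Generator) -> int:
    order = np.argsort(-probs)                    # descending probability
    sorted_probs = probs[order]
    cutoff = int(np.searchsorted(np.cumsum(sorted_probs), topp)) + 1
    cutoff = min(cutoff, len(probs))              # guard against rounding past the end
    kept = sorted_probs[:cutoff]
    choice = rng.choice(cutoff, p=kept / kept.sum())
    return int(order[choice])
```
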
1035 |
+
|
1036 |
+
void build_sampler(Sampler* sampler, int vocab_size, float temperature, float topp, unsigned long long rng_seed) {
|
1037 |
+
sampler->vocab_size = vocab_size;
|
1038 |
+
sampler->temperature = temperature;
|
1039 |
+
sampler->topp = topp;
|
1040 |
+
sampler->rng_state = rng_seed;
|
1041 |
+
// buffer only used with nucleus sampling; may not need but it's ~small
|
1042 |
+
sampler->probindex = (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex));
|
1043 |
+
}
|
1044 |
+
|
1045 |
+
void free_sampler(Sampler* sampler) {
|
1046 |
+
free(sampler->probindex);
|
1047 |
+
sampler->probindex = NULL;
|
1048 |
+
}
|
1049 |
+
|
1050 |
+
unsigned int random_u32(unsigned long long *state) {
|
1051 |
+
// xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
|
1052 |
+
*state ^= *state >> 12;
|
1053 |
+
*state ^= *state << 25;
|
1054 |
+
*state ^= *state >> 27;
|
1055 |
+
return (*state * 0x2545F4914F6CDD1Dull) >> 32;
|
1056 |
+
}
|
1057 |
+
float random_f32(unsigned long long *state) { // random float32 in [0,1)
|
1058 |
+
return (random_u32(state) >> 8) / 16777216.0f;
|
1059 |
+
}
|
1060 |
+
|
1061 |
+
int sample(Sampler* sampler, float* logits) {
|
1062 |
+
// sample the token given the logits and some hyperparameters
|
1063 |
+
int next;
|
1064 |
+
if (sampler->temperature == 0.0f) {
|
1065 |
+
// greedy argmax sampling: take the token with the highest probability
|
1066 |
+
next = sample_argmax(logits, sampler->vocab_size);
|
1067 |
+
} else {
|
1068 |
+
// apply the temperature to the logits
|
1069 |
+
for (int q=0; q<sampler->vocab_size; q++) { logits[q] /= sampler->temperature; }
|
1070 |
+
// apply softmax to the logits to get the probabilities for next token
|
1071 |
+
softmax(logits, sampler->vocab_size);
|
1072 |
+
// flip a (float) coin (this is our source of entropy for sampling)
|
1073 |
+
float coin = random_f32(&sampler->rng_state);
|
1074 |
+
// we sample from this distribution to get the next token
|
1075 |
+
if (sampler->topp <= 0 || sampler->topp >= 1) {
|
1076 |
+
// simply sample from the predicted probability distribution
|
1077 |
+
next = sample_mult(logits, sampler->vocab_size, coin);
|
1078 |
+
} else {
|
1079 |
+
// top-p (nucleus) sampling, clamping the least likely tokens to zero
|
1080 |
+
next = sample_topp(logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin);
|
1081 |
+
}
|
1082 |
+
}
|
1083 |
+
return next;
|
1084 |
+
}
|
1085 |
+
|
1086 |
+
// ----------------------------------------------------------------------------
|
1087 |
+
// utilities: time
|
1088 |
+
|
1089 |
+
long time_in_ms() {
|
1090 |
+
// return time in milliseconds, for benchmarking the model speed
|
1091 |
+
struct timespec time;
|
1092 |
+
clock_gettime(CLOCK_REALTIME, &time);
|
1093 |
+
return time.tv_sec * 1000 + time.tv_nsec / 1000000;
|
1094 |
+
}
|
1095 |
+
|
1096 |
+
// ----------------------------------------------------------------------------
|
1097 |
+
// generation loop
|
1098 |
+
|
1099 |
+
// void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, char *prompt, int steps) {
|
1100 |
+
// char *empty_prompt = (char *)"";
|
1101 |
+
// if (prompt == NULL) { prompt = empty_prompt; }
|
1102 |
+
|
1103 |
+
// // encode the (string) prompt into tokens sequence
|
1104 |
+
// int num_prompt_tokens = 0;
|
1105 |
+
// int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS
|
1106 |
+
// encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
|
1107 |
+
// if (num_prompt_tokens < 1) {
|
1108 |
+
// fprintf(stderr, "something is wrong, expected at least 1 prompt token\n");
|
1109 |
+
// exit(EXIT_FAILURE);
|
1110 |
+
// }
|
1111 |
+
|
1112 |
+
// // start the main loop
|
1113 |
+
// long start = 0; // used to time our code, only initialized after first iteration
|
1114 |
+
// int next; // will store the next token in the sequence
|
1115 |
+
// int token = prompt_tokens[0]; // kick off with the first token in the prompt
|
1116 |
+
// int pos = 0; // position in the sequence
|
1117 |
+
// while (pos < steps) {
|
1118 |
+
|
1119 |
+
// // forward the transformer to get logits for the next token
|
1120 |
+
// float* logits = forward(transformer, token, pos);
|
1121 |
+
|
1122 |
+
// // advance the state machine
|
1123 |
+
// if (pos < num_prompt_tokens - 1) {
|
1124 |
+
// // if we are still processing the input prompt, force the next prompt token
|
1125 |
+
// next = prompt_tokens[pos + 1];
|
1126 |
+
// } else {
|
1127 |
+
// // otherwise sample the next token from the logits
|
1128 |
+
// next = sample(sampler, logits);
|
1129 |
+
// }
|
1130 |
+
// pos++;
|
1131 |
+
|
1132 |
+
// // data-dependent terminating condition: the BOS (=1) token delimits sequences
|
1133 |
+
// if (next == 1) { break; }
|
1134 |
+
|
1135 |
+
// // print the token as string, decode it with the Tokenizer object
|
1136 |
+
// char* piece = decode(tokenizer, token, next);
|
1137 |
+
// safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes
|
1138 |
+
// fflush(stdout);
|
1139 |
+
// token = next;
|
1140 |
+
|
1141 |
+
// // init the timer here because the first iteration can be slower
|
1142 |
+
// if (start == 0) { start = time_in_ms(); }
|
1143 |
+
// }
|
1144 |
+
// printf("\n");
|
1145 |
+
|
1146 |
+
// // report achieved tok/s (pos-1 because the timer starts after first iteration)
|
1147 |
+
// if (pos > 1) {
|
1148 |
+
// long end = time_in_ms();
|
1149 |
+
// fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000);
|
1150 |
+
// }
|
1151 |
+
|
1152 |
+
// free(prompt_tokens);
|
1153 |
+
// }
|
1154 |
+
|
1155 |
+
// void read_stdin(const char* guide, char* buffer, size_t bufsize) {
|
1156 |
+
// // read a line from stdin, up to but not including \n
|
1157 |
+
// printf("%s", guide);
|
1158 |
+
// if (fgets(buffer, bufsize, stdin) != NULL) {
|
1159 |
+
// size_t len = strlen(buffer);
|
1160 |
+
// if (len > 0 && buffer[len - 1] == '\n') {
|
1161 |
+
// buffer[len - 1] = '\0'; // strip newline
|
1162 |
+
// }
|
1163 |
+
// }
|
1164 |
+
// }
|
1165 |
+
|
1166 |
+
// // ----------------------------------------------------------------------------
|
1167 |
+
// // chat loop
|
1168 |
+
// // I manually inspected the tokens for a few chat conversations compared to
|
1169 |
+
// // python reference and that seemed ok, but this was not thoroughly tested and
|
1170 |
+
// // is not safely implemented, it's more a proof of concept atm.
|
1171 |
+
|
1172 |
+
// void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
|
1173 |
+
// char *cli_user_prompt, char *cli_system_prompt, int steps) {
|
1174 |
+
|
1175 |
+
// // buffers for reading the system prompt and user prompt from stdin
|
1176 |
+
// // you'll notice they are soomewhat haphazardly and unsafely set atm
|
1177 |
+
// char system_prompt[512];
|
1178 |
+
// char user_prompt[512];
|
1179 |
+
// char rendered_prompt[1152];
|
1180 |
+
// int num_prompt_tokens = 0;
|
1181 |
+
// int* prompt_tokens = (int*)malloc(1152 * sizeof(int));
|
1182 |
+
// int user_idx;
|
1183 |
+
|
1184 |
+
// // start the main loop
|
1185 |
+
// int8_t user_turn = 1; // user starts
|
1186 |
+
// int next; // will store the next token in the sequence
|
1187 |
+
// int token; // stores the current token to feed into the transformer
|
1188 |
+
// int prev_token;
|
1189 |
+
// int pos = 0; // position in the sequence
|
1190 |
+
// while (pos < steps) {
|
1191 |
+
|
1192 |
+
// // when it is the user's turn to contribute tokens to the dialog...
|
1193 |
+
// if (user_turn) {
|
1194 |
+
// // get the (optional) system prompt at position 0
|
1195 |
+
// if (pos == 0) {
|
1196 |
+
// // at position 0, the user can also contribute a system prompt
|
1197 |
+
// if (cli_system_prompt == NULL) {
|
1198 |
+
// // system prompt was not passed in, attempt to get it from stdin
|
1199 |
+
// read_stdin("Enter system prompt (optional): ", system_prompt, sizeof(system_prompt));
|
1200 |
+
// } else {
|
1201 |
+
// // system prompt was passed in, use it
|
1202 |
+
// strcpy(system_prompt, cli_system_prompt);
|
1203 |
+
// }
|
1204 |
+
// }
|
1205 |
+
// // get the user prompt
|
1206 |
+
// if (pos == 0 && cli_user_prompt != NULL) {
|
1207 |
+
// // user prompt for position 0 was passed in, use it
|
1208 |
+
// strcpy(user_prompt, cli_user_prompt);
|
1209 |
+
// } else {
|
1210 |
+
// // otherwise get user prompt from stdin
|
1211 |
+
// read_stdin("User: ", user_prompt, sizeof(user_prompt));
|
1212 |
+
// }
|
1213 |
+
// // render user/system prompts into the Llama 2 Chat schema
|
1214 |
+
// if (pos == 0 && system_prompt[0] != '\0') {
|
1215 |
+
// char system_template[] = "[INST] <<SYS>>\n%s\n<</SYS>>\n\n%s [/INST]";
|
1216 |
+
// sprintf(rendered_prompt, system_template, system_prompt, user_prompt);
|
1217 |
+
// } else {
|
1218 |
+
// char user_template[] = "[INST] %s [/INST]";
|
1219 |
+
// sprintf(rendered_prompt, user_template, user_prompt);
|
1220 |
+
// }
|
1221 |
+
// // encode the rendered prompt into tokens
|
1222 |
+
// encode(tokenizer, rendered_prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
|
1223 |
+
// user_idx = 0; // reset the user index
|
1224 |
+
// user_turn = 0;
|
1225 |
+
// printf("Assistant: ");
|
1226 |
+
// }
|
1227 |
+
|
1228 |
+
// // determine the token to pass into the transformer next
|
1229 |
+
// if (user_idx < num_prompt_tokens) {
|
1230 |
+
// // if we are still processing the input prompt, force the next prompt token
|
1231 |
+
// token = prompt_tokens[user_idx++];
|
1232 |
+
// } else {
|
1233 |
+
// // otherwise use the next token sampled from previous turn
|
1234 |
+
// token = next;
|
1235 |
+
// }
|
1236 |
+
// // EOS (=2) token ends the Assistant turn
|
1237 |
+
// if (token == 2) { user_turn = 1; }
|
1238 |
+
|
1239 |
+
// // forward the transformer to get logits for the next token
|
1240 |
+
// float* logits = forward(transformer, token, pos);
|
1241 |
+
// next = sample(sampler, logits);
|
1242 |
+
// pos++;
|
1243 |
+
|
1244 |
+
// if (user_idx >= num_prompt_tokens && next != 2) {
|
1245 |
+
// // the Assistant is responding, so print its output
|
1246 |
+
// char* piece = decode(tokenizer, token, next);
|
1247 |
+
// safe_printf(piece); // same as printf("%s", piece), but skips "unsafe" bytes
|
1248 |
+
// fflush(stdout);
|
1249 |
+
// }
|
1250 |
+
// if (next == 2) { printf("\n"); }
|
1251 |
+
// }
|
1252 |
+
// printf("\n");
|
1253 |
+
// free(prompt_tokens);
|
1254 |
+
// }
|
1255 |
+
|
1256 |
+
typedef struct {
|
1257 |
+
Transformer transformer;
|
1258 |
+
Tokenizer tokenizer;
|
1259 |
+
Sampler sampler;
|
1260 |
+
int *output; // buffer to store the output tokens(max_tokens + 1)
|
1261 |
+
int output_idx; // current index in the output buffer(0 ... max_tokens - 1)
|
1262 |
+
int gen_idx; // generated tokens(0 ... max_tokens)
|
1263 |
+
int finished;
|
1264 |
+
#ifdef USE_CUDA
|
1265 |
+
cublasHandle_t g_cublas_handle;
|
1266 |
+
#endif
|
1267 |
+
} llama2_ctx;
|
1268 |
+
|
1269 |
+
void *llama2_init(char *model_path, char *tokenizer_path) {
|
1270 |
+
llama2_ctx *ctx = (llama2_ctx *)malloc(sizeof(llama2_ctx));
|
1271 |
+
build_transformer(&ctx->transformer, model_path);
|
1272 |
+
build_tokenizer(&ctx->tokenizer, tokenizer_path, ctx->transformer.config.vocab_size);
|
1273 |
+
ctx->output = NULL;
|
1274 |
+
#ifdef USE_CUDA
|
1275 |
+
cublasStatus_t stat = cublasCreate(&ctx->g_cublas_handle); // FIXME cublasDestroy
|
1276 |
+
if (stat != CUBLAS_STATUS_SUCCESS) {
|
1277 |
+
printf ("CUBLAS initialization failed\n");
|
1278 |
+
exit(EXIT_FAILURE);
|
1279 |
+
}
|
1280 |
+
#endif
|
1281 |
+
return ctx;
|
1282 |
+
}
|
1283 |
+
|
1284 |
+
void llama2_free(void *ctx) {
|
1285 |
+
llama2_ctx *c = (llama2_ctx *)ctx;
|
1286 |
+
free_transformer(&c->transformer);
|
1287 |
+
free_tokenizer(&c->tokenizer);
|
1288 |
+
if (c->sampler.probindex != NULL)
|
1289 |
+
free_sampler(&c->sampler);
|
1290 |
+
#ifdef USE_CUDA
|
1291 |
+
cublasStatus_t stat = cublasDestroy(c->g_cublas_handle);
|
1292 |
+
if (stat != CUBLAS_STATUS_SUCCESS) {
|
1293 |
+
printf ("CUBLAS destroy failed\n");
|
1294 |
+
exit(EXIT_FAILURE);
|
1295 |
+
}
|
1296 |
+
#endif
|
1297 |
+
if (c->output != NULL)
|
1298 |
+
free(c->output);
|
1299 |
+
}
|
1300 |
+
|
1301 |
+
void llama2_generate_loop(llama2_ctx *ctx, int *prompt_tokens, int num_prompt_tokens, int steps, int *output_tokens) {
|
1302 |
+
// printf("generate loop started\n");
|
1303 |
+
// start the main loop
|
1304 |
+
// long start = 0; // used to time our code, only initialized after first iteration
|
1305 |
+
int next; // will store the next token in the sequence
|
1306 |
+
int token = prompt_tokens[0]; // kick off with the first token in the prompt
|
1307 |
+
int pos = 0; // position in the sequence
|
1308 |
+
while (pos < steps) {
|
1309 |
+
|
1310 |
+
// forward the transformer to get logits for the next token
|
1311 |
+
#ifdef USE_CUDA
|
1312 |
+
float* logits = forward(&ctx->transformer, token, pos, ctx->g_cublas_handle);
|
1313 |
+
#else
|
1314 |
+
float* logits = forward(&ctx->transformer, token, pos);
|
1315 |
+
#endif
|
1316 |
+
// advance the state machine
|
1317 |
+
if (pos < num_prompt_tokens - 1) {
|
1318 |
+
// if we are still processing the input prompt, force the next prompt token
|
1319 |
+
next = prompt_tokens[pos + 1];
|
1320 |
+
} else {
|
1321 |
+
// otherwise sample the next token from the logits
|
1322 |
+
next = sample(&ctx->sampler, logits);
|
1323 |
+
}
|
1324 |
+
// printf("current gen idx: %d, %d\n", ctx->gen_idx, next);
|
1325 |
+
if (pos == num_prompt_tokens - 1)
|
1326 |
+
output_tokens[ctx->gen_idx] = token;
|
1327 |
+
if (pos >= num_prompt_tokens - 1)
|
1328 |
+
output_tokens[ctx->gen_idx++ + 1] = next;
|
1329 |
+
pos++;
|
1330 |
+
token = next;
|
1331 |
+
|
1332 |
+
// EOS (=2) token ends the Assistant turn
|
1333 |
+
if (next == 2)
|
1334 |
+
break;
|
1335 |
+
}
|
1336 |
+
// report achieved tok/s (pos-1 because the timer starts after first iteration)
|
1337 |
+
// if (pos > 1) {
|
1338 |
+
// long end = time_in_ms();
|
1339 |
+
// fprintf(stderr, "achieved tok/s: %f\n", (pos-1) / (double)(end-start)*1000);
|
1340 |
+
// }
|
1341 |
+
ctx->finished = 1;
|
1342 |
+
free(prompt_tokens);
|
1343 |
+
free_sampler(&ctx->sampler);
|
1344 |
+
// printf("generate loop finished\n");
|
1345 |
+
}
|
1346 |
+
|
1347 |
+
int llama2_generate(void *ctx, char *prompt, int steps, float temperature, float topp, int seed) {
|
1348 |
+
llama2_ctx *c = (llama2_ctx *)ctx;
|
1349 |
+
build_sampler(&c->sampler, c->transformer.config.vocab_size, temperature, topp, seed);
|
1350 |
+
char *empty_prompt = (char *)"";
|
1351 |
+
if (prompt == NULL) { prompt = empty_prompt; }
|
1352 |
+
// encode the (string) prompt into tokens sequence
|
1353 |
+
int num_prompt_tokens = 0;
|
1354 |
+
int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int)); // +3 for '\0', ?BOS, ?EOS
|
1355 |
+
encode(&c->tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
|
1356 |
+
if (num_prompt_tokens < 1) {
|
1357 |
+
fprintf(stderr, "something is wrong, expected at least 1 prompt token\n");
|
1358 |
+
return 1;
|
1359 |
+
}
|
1360 |
+
if (num_prompt_tokens >= steps) {
|
1361 |
+
fprintf(stderr, "prompt tokens exceeds max token length\n");
|
1362 |
+
return 1;
|
1363 |
+
}
|
1364 |
+
c->output = (int *)malloc((steps + 1) * sizeof(int));
|
1365 |
+
c->gen_idx = 0;
|
1366 |
+
c->output_idx = 0;
|
1367 |
+
c->finished = 0;
|
1368 |
+
std::thread t(llama2_generate_loop, c, prompt_tokens, num_prompt_tokens, steps, c->output);
|
1369 |
+
t.detach();
|
1370 |
+
return 0;
|
1371 |
+
}
|
1372 |
+
|
1373 |
+
char *llama2_get_last(void *ctx) {
|
1374 |
+
llama2_ctx *c = (llama2_ctx *)ctx;
|
1375 |
+
assert(c->output != NULL); // shouldn't be called again after finished
|
1376 |
+
while(!c->finished && c->output_idx >= c->gen_idx) {
|
1377 |
+
// printf("current idx: %d, %d\n", c->output_idx, c->gen_idx);
|
1378 |
+
usleep(100000);
|
1379 |
+
} // wait for next token
|
1380 |
+
if (c->finished && c->output_idx >= c->gen_idx) {
|
1381 |
+
free(c->output);
|
1382 |
+
c->output = NULL;
|
1383 |
+
return NULL;
|
1384 |
+
}
|
1385 |
+
// printf("current idx: %d, %d, finished:%d\n", c->output_idx, c->gen_idx, c->finished);
|
1386 |
+
char *piece = decode(&c->tokenizer, c->output[c->output_idx], c->output[c->output_idx + 1]);
|
1387 |
+
c->output_idx++;
|
1388 |
+
return piece;
|
1389 |
+
}
|
1390 |
+
|
1391 |
+
void llama2_tokenize(void *ctx, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens) {
|
1392 |
+
llama2_ctx *c = (llama2_ctx *)ctx;
|
1393 |
+
encode(&c->tokenizer, text, bos, eos, tokens, n_tokens);
|
1394 |
+
}
|
llama2_cu_python/llama2.h
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef __LLAMA2_H__
|
2 |
+
#define __LLAMA2_H__
|
3 |
+
#include <stdint.h>
|
4 |
+
|
5 |
+
#ifdef __cplusplus
|
6 |
+
extern "C" {
|
7 |
+
#endif
|
8 |
+
|
9 |
+
void *llama2_init(char *model_path, char *tokenizer_path);
|
10 |
+
|
11 |
+
void llama2_free(void *ctx);
|
12 |
+
|
13 |
+
int llama2_generate(void *ctx, char *prompt, int steps, float temperature, float topp, int seed);
|
14 |
+
|
15 |
+
char *llama2_get_last(void *ctx);
|
16 |
+
|
17 |
+
void llama2_tokenize(void *ctx, char *text, int8_t bos, int8_t eos, int *tokens, int *n_tokens);
|
18 |
+
|
19 |
+
#ifdef __cplusplus
|
20 |
+
}
|
21 |
+
#endif // __cplusplus
|
22 |
+
|
23 |
+
#endif // __LLAMA2_H__
|
llama2_cu_python/llama2_cu.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, Generator, Iterator, List, Optional, Union
|
2 |
+
import ctypes
|
3 |
+
from ctypes import (
|
4 |
+
c_bool,
|
5 |
+
c_char_p,
|
6 |
+
c_int,
|
7 |
+
c_int8,
|
8 |
+
c_int32,
|
9 |
+
c_uint8,
|
10 |
+
c_uint32,
|
11 |
+
c_size_t,
|
12 |
+
c_float,
|
13 |
+
c_double,
|
14 |
+
c_void_p,
|
15 |
+
POINTER,
|
16 |
+
_Pointer, # type: ignore
|
17 |
+
Structure,
|
18 |
+
Array,
|
19 |
+
)
|
20 |
+
import pathlib
|
21 |
+
import os
|
22 |
+
import sys
|
23 |
+
|
24 |
+
# Load the library
|
25 |
+
def _load_shared_library(lib_base_name: str):
|
26 |
+
# Construct the paths to the possible shared library names
|
27 |
+
_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
|
28 |
+
# Searching for the library in the current directory under the name "libllama2" (default name
|
29 |
+
# for llama2.cu) and "llama" (default name for this repo)
|
30 |
+
_lib_paths: List[pathlib.Path] = []
|
31 |
+
# Determine the file extension based on the platform
|
32 |
+
if sys.platform.startswith("linux"):
|
33 |
+
_lib_paths += [
|
34 |
+
_base_path / f"lib{lib_base_name}.so",
|
35 |
+
]
|
36 |
+
else:
|
37 |
+
raise RuntimeError("Unsupported platform")
|
38 |
+
|
39 |
+
if "LLAMA2_CU_LIB" in os.environ:
|
40 |
+
lib_base_name = os.environ["LLAMA2_CU_LIB"]
|
41 |
+
_lib = pathlib.Path(lib_base_name)
|
42 |
+
_base_path = _lib.parent.resolve()
|
43 |
+
_lib_paths = [_lib.resolve()]
|
44 |
+
|
45 |
+
cdll_args = dict() # type: ignore
|
46 |
+
# Add the library directory to the DLL search path on Windows (if needed)
|
47 |
+
|
48 |
+
# Try to load the shared library, handling potential errors
|
49 |
+
for _lib_path in _lib_paths:
|
50 |
+
if _lib_path.exists():
|
51 |
+
try:
|
52 |
+
return ctypes.CDLL(str(_lib_path), **cdll_args)
|
53 |
+
except Exception as e:
|
54 |
+
raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
|
55 |
+
|
56 |
+
raise FileNotFoundError(
|
57 |
+
f"Shared library with base name '{lib_base_name}' not found"
|
58 |
+
)
|
59 |
+
|
60 |
+
# Specify the base name of the shared library to load
|
61 |
+
_lib_base_name = "llama2"
|
62 |
+
|
63 |
+
# Load the library
|
64 |
+
_lib = _load_shared_library(_lib_base_name)
|
65 |
+
|
66 |
+
|
67 |
+
def llama2_init(model_path: str, tokenizer_path: str) -> c_void_p:
|
68 |
+
return _lib.llama2_init(model_path.encode('utf-8'), tokenizer_path.encode('utf-8'))
|
69 |
+
|
70 |
+
_lib.llama2_init.argtypes = [c_char_p, c_char_p]
|
71 |
+
_lib.llama2_init.restype = c_void_p
|
72 |
+
|
73 |
+
def llama2_free(ctx: c_void_p) -> None:
|
74 |
+
_lib.llama2_free(ctx)
|
75 |
+
|
76 |
+
_lib.llama2_free.argtypes = [c_void_p]
|
77 |
+
_lib.llama2_free.restype = None
|
78 |
+
|
79 |
+
def llama2_generate(ctx: c_void_p, prompt: str, max_tokens: int, temperature: float, top_p: float, seed: int) -> int:
|
80 |
+
return _lib.llama2_generate(ctx, prompt.encode('utf-8'), max_tokens, temperature, top_p, seed)
|
81 |
+
|
82 |
+
_lib.llama2_generate.argtypes = [c_void_p, c_char_p, c_int, c_float, c_float, c_int]
|
83 |
+
_lib.llama2_generate.restype = c_int
|
84 |
+
|
85 |
+
def llama2_get_last(ctx: c_void_p) -> bytes:
|
86 |
+
return _lib.llama2_get_last(ctx) # bytes or None
|
87 |
+
|
88 |
+
_lib.llama2_get_last.argtypes = [c_void_p]
|
89 |
+
_lib.llama2_get_last.restype = c_char_p
|
90 |
+
|
91 |
+
def llama2_tokenize(ctx: c_void_p, text: str, add_bos: bool, add_eos: bool) -> List[int]:
|
92 |
+
tokens = (c_int * (len(text) + 3))()
|
93 |
+
n_tokens = (c_int * 1)()
|
94 |
+
_lib.llama2_tokenize(ctx, text.encode('utf-8'), add_bos, add_eos, tokens, n_tokens)
|
95 |
+
return tokens[:n_tokens[0]]
|
96 |
+
|
97 |
+
_lib.llama2_tokenize.argtypes = [c_void_p, c_char_p, c_int8, c_int8, POINTER(c_int), POINTER(c_int)]
|
98 |
+
_lib.llama2_tokenize.restype = None
|
99 |
+
|
100 |
+
class Llama2:
|
101 |
+
def __init__(
|
102 |
+
self,
|
103 |
+
model_path: str,
|
104 |
+
tokenizer_path: str='tokenizer.bin',
|
105 |
+
n_ctx: int = 0,
|
106 |
+
n_batch: int = 0) -> None:
|
107 |
+
self.n_ctx = n_ctx
|
108 |
+
self.n_batch = n_batch
|
109 |
+
self.llama2_ctx = llama2_init(model_path, tokenizer_path)
|
110 |
+
|
111 |
+
def tokenize(
|
112 |
+
self, text: str, add_bos: bool = True, add_eos: bool = False
|
113 |
+
) -> List[int]:
|
114 |
+
return llama2_tokenize(self.llama2_ctx, text, add_bos, add_eos)
|
115 |
+
|
116 |
+
def __call__(
|
117 |
+
self,
|
118 |
+
prompt: str,
|
119 |
+
max_tokens: int = 128,
|
120 |
+
temperature: float = 0.8,
|
121 |
+
top_p: float = 0.95,
|
122 |
+
min_p: float = 0.05,
|
123 |
+
typical_p: float = 1.0,
|
124 |
+
logprobs: Optional[int] = None,
|
125 |
+
frequency_penalty: float = 0.0,
|
126 |
+
presence_penalty: float = 0.0,
|
127 |
+
repeat_penalty: float = 1.1,
|
128 |
+
top_k: int = 40,
|
129 |
+
stream: bool = False,
|
130 |
+
seed: Optional[int] = None,
|
131 |
+
) -> Iterator[str]:
|
132 |
+
if seed is None:
|
133 |
+
seed = 42
|
134 |
+
ret = llama2_generate(self.llama2_ctx, prompt, max_tokens, temperature, top_p, seed)
|
135 |
+
if ret != 0:
|
136 |
+
raise RuntimeError(f"Failed to launch generation for prompt '{prompt}'")
|
137 |
+
bytes_buffer = b'' # store generated bytes until decoded (in case of multi-byte characters)
|
138 |
+
while True:
|
139 |
+
result = llama2_get_last(self.llama2_ctx)
|
140 |
+
if result is None:
|
141 |
+
break
|
142 |
+
bytes_buffer += result
|
143 |
+
try:
|
144 |
+
string = bytes_buffer.decode('utf-8')
|
145 |
+
except UnicodeDecodeError:
|
146 |
+
pass
|
147 |
+
else:
|
148 |
+
bytes_buffer = b''
|
149 |
+
yield string
|
150 |
+
|
151 |
+
|
llama2_wrapper/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .model import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog
|
llama2_wrapper/download/__init__.py
ADDED
File without changes
|
llama2_wrapper/download/__main__.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
|
4 |
+
|
5 |
+
def main():
|
6 |
+
parser = argparse.ArgumentParser()
|
7 |
+
parser.add_argument(
|
8 |
+
"--repo_id",
|
9 |
+
type=str,
|
10 |
+
default="",
|
11 |
+
required=True,
|
12 |
+
help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML' ",
|
13 |
+
)
|
14 |
+
parser.add_argument(
|
15 |
+
"--filename",
|
16 |
+
type=str,
|
17 |
+
default=None,
|
18 |
+
help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin",
|
19 |
+
)
|
20 |
+
parser.add_argument(
|
21 |
+
"--save_dir", type=str, default="./models", help="Directory to save models"
|
22 |
+
)
|
23 |
+
|
24 |
+
args = parser.parse_args()
|
25 |
+
|
26 |
+
repo_id = args.repo_id
|
27 |
+
save_dir = args.save_dir
|
28 |
+
|
29 |
+
if not os.path.exists(save_dir):
|
30 |
+
os.makedirs(save_dir)
|
31 |
+
|
32 |
+
if args.filename:
|
33 |
+
filename = args.filename
|
34 |
+
from huggingface_hub import hf_hub_download
|
35 |
+
|
36 |
+
print(f"Start downloading model {repo_id} {filename} to: {save_dir}")
|
37 |
+
|
38 |
+
hf_hub_download(
|
39 |
+
repo_id=repo_id,
|
40 |
+
filename=filename,
|
41 |
+
local_dir=save_dir,
|
42 |
+
)
|
43 |
+
else:
|
44 |
+
repo_name = repo_id.split("/")[1]
|
45 |
+
save_path = os.path.join(save_dir, repo_name)
|
46 |
+
if not os.path.exists(save_path):
|
47 |
+
os.makedirs(save_path)
|
48 |
+
print(f"Start downloading model {repo_id} to: {save_path}")
|
49 |
+
|
50 |
+
from huggingface_hub import snapshot_download
|
51 |
+
|
52 |
+
snapshot_download(
|
53 |
+
repo_id=repo_id,
|
54 |
+
local_dir=save_path,
|
55 |
+
)
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == "__main__":
|
59 |
+
main()
|
llama2_wrapper/model.py
ADDED
@@ -0,0 +1,839 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import uuid
|
4 |
+
from enum import Enum
|
5 |
+
from threading import Thread
|
6 |
+
from typing import Any, Iterator, Union, List
|
7 |
+
from llama2_wrapper.types import (
|
8 |
+
Completion,
|
9 |
+
CompletionChunk,
|
10 |
+
ChatCompletion,
|
11 |
+
ChatCompletionChunk,
|
12 |
+
# ChatCompletionMessage,
|
13 |
+
Message,
|
14 |
+
B_INST,
|
15 |
+
E_INST,
|
16 |
+
B_SYS,
|
17 |
+
E_SYS,
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
class LLAMA2_WRAPPER:
|
22 |
+
def __init__(
|
23 |
+
self,
|
24 |
+
model_path: str = "",
|
25 |
+
tokenizer_path: str = "",
|
26 |
+
backend_type: str = "llama.cpp",
|
27 |
+
max_tokens: int = 4000,
|
28 |
+
load_in_8bit: bool = True,
|
29 |
+
verbose: bool = False,
|
30 |
+
):
|
31 |
+
"""Load a llama2 model from `model_path`.
|
32 |
+
|
33 |
+
Args:
|
34 |
+
model_path: Path to the model.
|
35 |
+
backend_type: Backend for llama2, options: llama.cpp, gptq, transformers
|
36 |
+
max_tokens: Maximum context size.
|
37 |
+
load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models).
|
38 |
+
verbose: Print verbose output to stderr.
|
39 |
+
|
40 |
+
Raises:
|
41 |
+
ValueError: If the model path does not exist.
|
42 |
+
|
43 |
+
Returns:
|
44 |
+
A LLAMA2_WRAPPER instance.
|
45 |
+
"""
|
46 |
+
self.model_path = model_path
|
47 |
+
self.tokenizer_path = tokenizer_path
|
48 |
+
self.backend_type = BackendType.get_type(backend_type)
|
49 |
+
self.max_tokens = max_tokens
|
50 |
+
self.load_in_8bit = load_in_8bit
|
51 |
+
|
52 |
+
self.model = None
|
53 |
+
self.tokenizer = None
|
54 |
+
|
55 |
+
self.verbose = verbose
|
56 |
+
|
57 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
58 |
+
print("Running on backend llama.cpp.")
|
59 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
60 |
+
print("Running on backend llama2.cu.")
|
61 |
+
else:
|
62 |
+
import torch
|
63 |
+
|
64 |
+
if torch.cuda.is_available():
|
65 |
+
print("Running on GPU with backend torch transformers.")
|
66 |
+
else:
|
67 |
+
print("GPU CUDA not found.")
|
68 |
+
|
69 |
+
self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf"
|
70 |
+
self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ"
|
71 |
+
self.default_llama2cu_path = "./models/llama2_7b.bin"
|
72 |
+
# Download default ggml/gptq model
|
73 |
+
if self.model_path == "":
|
74 |
+
print("Model path is empty.")
|
75 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
76 |
+
print("Use default llama.cpp model path: " + self.default_llamacpp_path)
|
77 |
+
if not os.path.exists(self.default_llamacpp_path):
|
78 |
+
print("Start downloading model to: " + self.default_llamacpp_path)
|
79 |
+
from huggingface_hub import hf_hub_download
|
80 |
+
|
81 |
+
hf_hub_download(
|
82 |
+
repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
|
83 |
+
filename="llama-2-7b-chat.Q4_0.gguf",
|
84 |
+
local_dir="./models/",
|
85 |
+
)
|
86 |
+
else:
|
87 |
+
print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.")
|
88 |
+
self.model_path = self.default_llamacpp_path
|
89 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
90 |
+
if not os.path.exists(self.default_llama2cu_path):
|
91 |
+
print("Default model not found in " + self.default_llama2cu_path)
|
92 |
+
exit(1)
|
93 |
+
else:
|
94 |
+
print("Model exists in " + self.default_llama2cu_path)
|
95 |
+
self.model_path = self.default_llama2cu_path
|
96 |
+
elif self.backend_type is BackendType.GPTQ:
|
97 |
+
print("Use default gptq model path: " + self.default_gptq_path)
|
98 |
+
if not os.path.exists(self.default_gptq_path):
|
99 |
+
print("Start downloading model to: " + self.default_gptq_path)
|
100 |
+
from huggingface_hub import snapshot_download
|
101 |
+
|
102 |
+
snapshot_download(
|
103 |
+
"TheBloke/Llama-2-7b-Chat-GPTQ",
|
104 |
+
local_dir=self.default_gptq_path,
|
105 |
+
)
|
106 |
+
else:
|
107 |
+
print("Model exists in " + self.default_gptq_path)
|
108 |
+
self.model_path = self.default_gptq_path
|
109 |
+
|
110 |
+
self.init_tokenizer()
|
111 |
+
self.init_model()
|
112 |
+
|
113 |
+
def init_model(self):
|
114 |
+
if self.model is None:
|
115 |
+
self.model = LLAMA2_WRAPPER.create_llama2_model(
|
116 |
+
self.model_path,
|
117 |
+
self.backend_type,
|
118 |
+
self.max_tokens,
|
119 |
+
self.load_in_8bit,
|
120 |
+
self.verbose,
|
121 |
+
self.tokenizer_path,
|
122 |
+
)
|
123 |
+
if self.backend_type not in [BackendType.LLAMA_CPP, BackendType.LLAMA2_CU]:
|
124 |
+
self.model.eval()
|
125 |
+
|
126 |
+
def init_tokenizer(self):
|
127 |
+
if self.backend_type not in [BackendType.LLAMA_CPP, BackendType.LLAMA2_CU]:
|
128 |
+
if self.tokenizer is None:
|
129 |
+
self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path)
|
130 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
131 |
+
self.default_llama2cu_tokenizer = "./models/tokenizer.bin"
|
132 |
+
if not os.path.exists(self.default_llama2cu_tokenizer):
|
133 |
+
print("Default tokenizer not found in " + self.default_llama2cu_tokenizer)
|
134 |
+
exit(1)
|
135 |
+
else:
|
136 |
+
print("Tokenizer exists in " + self.default_llama2cu_tokenizer)
|
137 |
+
self.tokenizer_path = self.default_llama2cu_tokenizer
|
138 |
+
|
139 |
+
@classmethod
|
140 |
+
def create_llama2_model(
|
141 |
+
cls, model_path, backend_type, max_tokens, load_in_8bit, verbose, tokenizer_path
|
142 |
+
):
|
143 |
+
if backend_type is BackendType.LLAMA_CPP:
|
144 |
+
from llama_cpp import Llama
|
145 |
+
|
146 |
+
model = Llama(
|
147 |
+
model_path=model_path,
|
148 |
+
n_ctx=max_tokens,
|
149 |
+
n_batch=max_tokens,
|
150 |
+
verbose=verbose,
|
151 |
+
)
|
152 |
+
elif backend_type is BackendType.LLAMA2_CU:
|
153 |
+
from llama2_cu_python import Llama2
|
154 |
+
|
155 |
+
model = Llama2(model_path=model_path, tokenizer_path=tokenizer_path, n_ctx=max_tokens, n_batch=max_tokens)
|
156 |
+
elif backend_type is BackendType.GPTQ:
|
157 |
+
from auto_gptq import AutoGPTQForCausalLM
|
158 |
+
|
159 |
+
model = AutoGPTQForCausalLM.from_quantized(
|
160 |
+
model_path,
|
161 |
+
use_safetensors=True,
|
162 |
+
trust_remote_code=True,
|
163 |
+
device="cuda:0",
|
164 |
+
use_triton=False,
|
165 |
+
quantize_config=None,
|
166 |
+
)
|
167 |
+
elif backend_type is BackendType.TRANSFORMERS:
|
168 |
+
import torch
|
169 |
+
from transformers import AutoModelForCausalLM
|
170 |
+
|
171 |
+
model = AutoModelForCausalLM.from_pretrained(
|
172 |
+
model_path,
|
173 |
+
device_map="auto",
|
174 |
+
torch_dtype=torch.float16,
|
175 |
+
load_in_8bit=load_in_8bit,
|
176 |
+
)
|
177 |
+
else:
|
178 |
+
print(backend_type + "not implemented.")
|
179 |
+
return model
|
180 |
+
|
181 |
+
@classmethod
|
182 |
+
def create_llama2_tokenizer(cls, model_path):
|
183 |
+
from transformers import AutoTokenizer
|
184 |
+
|
185 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
186 |
+
return tokenizer
|
187 |
+
|
188 |
+
def get_token_length(
|
189 |
+
self,
|
190 |
+
prompt: str,
|
191 |
+
) -> int:
|
192 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
193 |
+
input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
|
194 |
+
return len(input_ids)
|
195 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
196 |
+
input_ids = self.model.tokenize(prompt)
|
197 |
+
return len(input_ids)
|
198 |
+
else:
|
199 |
+
input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
|
200 |
+
return input_ids.shape[-1]
|
201 |
+
|
202 |
+
def get_input_token_length(
|
203 |
+
self,
|
204 |
+
message: str,
|
205 |
+
chat_history: list[tuple[str, str]] = [],
|
206 |
+
system_prompt: str = "",
|
207 |
+
) -> int:
|
208 |
+
prompt = get_prompt(message, chat_history, system_prompt)
|
209 |
+
|
210 |
+
return self.get_token_length(prompt)
|
211 |
+
|
212 |
+
def generate(
|
213 |
+
self,
|
214 |
+
prompt: str,
|
215 |
+
max_new_tokens: int = 1000,
|
216 |
+
temperature: float = 0.9,
|
217 |
+
top_p: float = 1.0,
|
218 |
+
top_k: int = 40,
|
219 |
+
repetition_penalty: float = 1.0,
|
220 |
+
**kwargs: Any,
|
221 |
+
) -> Iterator[str]:
|
222 |
+
"""Create a generator of response from a prompt.
|
223 |
+
|
224 |
+
Examples:
|
225 |
+
>>> llama2_wrapper = LLAMA2_WRAPPER()
|
226 |
+
>>> prompt = get_prompt("Hi do you know Pytorch?")
|
227 |
+
>>> for response in llama2_wrapper.generate(prompt):
|
228 |
+
... print(response)
|
229 |
+
|
230 |
+
Args:
|
231 |
+
prompt: The prompt to generate text from.
|
232 |
+
max_new_tokens: The maximum number of tokens to generate.
|
233 |
+
temperature: The temperature to use for sampling.
|
234 |
+
top_p: The top-p value to use for sampling.
|
235 |
+
top_k: The top-k value to use for sampling.
|
236 |
+
repetition_penalty: The penalty to apply to repeated tokens.
|
237 |
+
kwargs: all other arguments.
|
238 |
+
|
239 |
+
Yields:
|
240 |
+
The generated text.
|
241 |
+
"""
|
242 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
243 |
+
result = self.model(
|
244 |
+
prompt=prompt,
|
245 |
+
stream=True,
|
246 |
+
max_tokens=max_new_tokens,
|
247 |
+
top_k=top_k,
|
248 |
+
top_p=top_p,
|
249 |
+
temperature=temperature,
|
250 |
+
repeat_penalty=repetition_penalty,
|
251 |
+
**kwargs,
|
252 |
+
)
|
253 |
+
outputs = []
|
254 |
+
for part in result:
|
255 |
+
text = part["choices"][0]["text"]
|
256 |
+
outputs.append(text)
|
257 |
+
yield "".join(outputs)
|
258 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
259 |
+
result = self.model(
|
260 |
+
prompt=prompt,
|
261 |
+
stream=True,
|
262 |
+
max_tokens=max_new_tokens,
|
263 |
+
top_k=top_k,
|
264 |
+
top_p=top_p,
|
265 |
+
temperature=temperature,
|
266 |
+
repeat_penalty=repetition_penalty,
|
267 |
+
**kwargs,
|
268 |
+
)
|
269 |
+
outputs = []
|
270 |
+
for part in result:
|
271 |
+
outputs.append(part)
|
272 |
+
yield "".join(outputs)
|
273 |
+
else:
|
274 |
+
from transformers import TextIteratorStreamer
|
275 |
+
|
276 |
+
inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
|
277 |
+
|
278 |
+
streamer = TextIteratorStreamer(
|
279 |
+
self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
|
280 |
+
)
|
281 |
+
generate_kwargs = dict(
|
282 |
+
inputs,
|
283 |
+
streamer=streamer,
|
284 |
+
max_new_tokens=max_new_tokens,
|
285 |
+
temperature=temperature,
|
286 |
+
top_p=top_p,
|
287 |
+
top_k=top_k,
|
288 |
+
repetition_penalty=repetition_penalty,
|
289 |
+
# num_beams=1,
|
290 |
+
)
|
291 |
+
generate_kwargs = (
|
292 |
+
generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
|
293 |
+
)
|
294 |
+
t = Thread(target=self.model.generate, kwargs=generate_kwargs)
|
295 |
+
t.start()
|
296 |
+
|
297 |
+
outputs = []
|
298 |
+
for text in streamer:
|
299 |
+
outputs.append(text)
|
300 |
+
yield "".join(outputs)
|
301 |
+
|
302 |
+
def run(
|
303 |
+
self,
|
304 |
+
message: str,
|
305 |
+
chat_history: list[tuple[str, str]] = [],
|
306 |
+
system_prompt: str = "",
|
307 |
+
max_new_tokens: int = 1000,
|
308 |
+
temperature: float = 0.9,
|
309 |
+
top_p: float = 1.0,
|
310 |
+
top_k: int = 40,
|
311 |
+
repetition_penalty: float = 1.0,
|
312 |
+
) -> Iterator[str]:
|
313 |
+
"""Create a generator of response from a chat message.
|
314 |
+
Process message to llama2 prompt with chat history
|
315 |
+
and system_prompt for chatbot.
|
316 |
+
|
317 |
+
Args:
|
318 |
+
message: The origianl chat message to generate text from.
|
319 |
+
chat_history: Chat history list from chatbot.
|
320 |
+
system_prompt: System prompt for chatbot.
|
321 |
+
max_new_tokens: The maximum number of tokens to generate.
|
322 |
+
temperature: The temperature to use for sampling.
|
323 |
+
top_p: The top-p value to use for sampling.
|
324 |
+
top_k: The top-k value to use for sampling.
|
325 |
+
repetition_penalty: The penalty to apply to repeated tokens.
|
326 |
+
kwargs: all other arguments.
|
327 |
+
|
328 |
+
Yields:
|
329 |
+
The generated text.
|
330 |
+
"""
|
331 |
+
prompt = get_prompt(message, chat_history, system_prompt)
|
332 |
+
return self.generate(
|
333 |
+
prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty
|
334 |
+
)
|
335 |
+
|
336 |
+
def __call__(
|
337 |
+
self,
|
338 |
+
prompt: str,
|
339 |
+
stream: bool = False,
|
340 |
+
max_new_tokens: int = 1000,
|
341 |
+
temperature: float = 0.9,
|
342 |
+
top_p: float = 1.0,
|
343 |
+
top_k: int = 40,
|
344 |
+
repetition_penalty: float = 1.0,
|
345 |
+
**kwargs: Any,
|
346 |
+
) -> Union[str, Iterator[str]]:
|
347 |
+
"""Generate text from a prompt.
|
348 |
+
|
349 |
+
Examples:
|
350 |
+
>>> llama2_wrapper = LLAMA2_WRAPPER()
|
351 |
+
>>> prompt = get_prompt("Hi do you know Pytorch?")
|
352 |
+
>>> print(llama2_wrapper(prompt))
|
353 |
+
|
354 |
+
Args:
|
355 |
+
prompt: The prompt to generate text from.
|
356 |
+
stream: Whether to stream the results.
|
357 |
+
max_new_tokens: The maximum number of tokens to generate.
|
358 |
+
temperature: The temperature to use for sampling.
|
359 |
+
top_p: The top-p value to use for sampling.
|
360 |
+
top_k: The top-k value to use for sampling.
|
361 |
+
repetition_penalty: The penalty to apply to repeated tokens.
|
362 |
+
kwargs: all other arguments.
|
363 |
+
|
364 |
+
Raises:
|
365 |
+
ValueError: If the requested tokens exceed the context window.
|
366 |
+
RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
|
367 |
+
|
368 |
+
Returns:
|
369 |
+
Generated text.
|
370 |
+
"""
|
371 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
372 |
+
completion_or_chunks = self.model.__call__(
|
373 |
+
prompt,
|
374 |
+
stream=stream,
|
375 |
+
max_tokens=max_new_tokens,
|
376 |
+
temperature=temperature,
|
377 |
+
top_p=top_p,
|
378 |
+
top_k=top_k,
|
379 |
+
repeat_penalty=repetition_penalty,
|
380 |
+
**kwargs,
|
381 |
+
)
|
382 |
+
if stream:
|
383 |
+
|
384 |
+
def chunk_generator(chunks):
|
385 |
+
for part in chunks:
|
386 |
+
chunk = part["choices"][0]["text"]
|
387 |
+
yield chunk
|
388 |
+
|
389 |
+
chunks: Iterator[str] = chunk_generator(completion_or_chunks)
|
390 |
+
return chunks
|
391 |
+
return completion_or_chunks["choices"][0]["text"]
|
392 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
393 |
+
pass # TODO
|
394 |
+
else:
|
395 |
+
inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
|
396 |
+
prompt_tokens_len = len(inputs[0])
|
397 |
+
inputs = inputs.to("cuda")
|
398 |
+
generate_kwargs = dict(
|
399 |
+
inputs=inputs,
|
400 |
+
max_new_tokens=max_new_tokens,
|
401 |
+
temperature=temperature,
|
402 |
+
top_p=top_p,
|
403 |
+
top_k=top_k,
|
404 |
+
repetition_penalty=repetition_penalty,
|
405 |
+
# num_beams=1,
|
406 |
+
)
|
407 |
+
generate_kwargs = (
|
408 |
+
generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
|
409 |
+
)
|
410 |
+
if stream:
|
411 |
+
from transformers import TextIteratorStreamer
|
412 |
+
|
413 |
+
streamer = TextIteratorStreamer(
|
414 |
+
self.tokenizer,
|
415 |
+
timeout=10.0,
|
416 |
+
skip_prompt=True,
|
417 |
+
skip_special_tokens=True,
|
418 |
+
)
|
419 |
+
generate_kwargs["streamer"] = streamer
|
420 |
+
|
421 |
+
t = Thread(target=self.model.generate, kwargs=generate_kwargs)
|
422 |
+
t.start()
|
423 |
+
return streamer
|
424 |
+
else:
|
425 |
+
output_ids = self.model.generate(
|
426 |
+
**generate_kwargs,
|
427 |
+
)
|
428 |
+
# skip prompt, skip special tokens
|
429 |
+
output = self.tokenizer.decode(
|
430 |
+
output_ids[0][prompt_tokens_len:], skip_special_tokens=True
|
431 |
+
)
|
432 |
+
return output
|
433 |
+
|
434 |
+
def completion(
|
435 |
+
self,
|
436 |
+
prompt: str,
|
437 |
+
stream: bool = False,
|
438 |
+
max_new_tokens: int = 1000,
|
439 |
+
temperature: float = 0.9,
|
440 |
+
top_p: float = 1.0,
|
441 |
+
top_k: int = 40,
|
442 |
+
repetition_penalty: float = 1.0,
|
443 |
+
**kwargs: Any,
|
444 |
+
) -> Union[Completion, Iterator[CompletionChunk]]:
|
445 |
+
"""For OpenAI compatible API /v1/completions
|
446 |
+
Generate text from a prompt.
|
447 |
+
|
448 |
+
Examples:
|
449 |
+
>>> llama2_wrapper = LLAMA2_WRAPPER()
|
450 |
+
>>> prompt = get_prompt("Hi do you know Pytorch?")
|
451 |
+
>>> print(llm.completion(prompt))
|
452 |
+
|
453 |
+
Args:
|
454 |
+
prompt: The prompt to generate text from.
|
455 |
+
stream: Whether to stream the results.
|
456 |
+
max_new_tokens: The maximum number of tokens to generate.
|
457 |
+
temperature: The temperature to use for sampling.
|
458 |
+
top_p: The top-p value to use for sampling.
|
459 |
+
top_k: The top-k value to use for sampling.
|
460 |
+
repetition_penalty: The penalty to apply to repeated tokens.
|
461 |
+
kwargs: all other arguments.
|
462 |
+
|
463 |
+
Raises:
|
464 |
+
ValueError: If the requested tokens exceed the context window.
|
465 |
+
RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
|
466 |
+
|
467 |
+
Returns:
|
468 |
+
Response object containing the generated text.
|
469 |
+
"""
|
470 |
+
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
471 |
+
created: int = int(time.time())
|
472 |
+
model_name: str = (
|
473 |
+
self.backend_type + " default model"
|
474 |
+
if self.model_path == ""
|
475 |
+
else self.model_path
|
476 |
+
)
|
477 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
478 |
+
completion_or_chunks = self.model.__call__(
|
479 |
+
prompt,
|
480 |
+
stream=stream,
|
481 |
+
max_tokens=max_new_tokens,
|
482 |
+
temperature=temperature,
|
483 |
+
top_p=top_p,
|
484 |
+
top_k=top_k,
|
485 |
+
repeat_penalty=repetition_penalty,
|
486 |
+
**kwargs,
|
487 |
+
)
|
488 |
+
if stream:
|
489 |
+
chunks: Iterator[CompletionChunk] = completion_or_chunks
|
490 |
+
return chunks
|
491 |
+
return completion_or_chunks
|
492 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
493 |
+
pass # TODO
|
494 |
+
else:
|
495 |
+
inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
|
496 |
+
prompt_tokens_len = len(inputs[0])
|
497 |
+
inputs = inputs.to("cuda")
|
498 |
+
generate_kwargs = dict(
|
499 |
+
inputs=inputs,
|
500 |
+
max_new_tokens=max_new_tokens,
|
501 |
+
temperature=temperature,
|
502 |
+
top_p=top_p,
|
503 |
+
top_k=top_k,
|
504 |
+
repetition_penalty=repetition_penalty,
|
505 |
+
# num_beams=1,
|
506 |
+
)
|
507 |
+
generate_kwargs = (
|
508 |
+
generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
|
509 |
+
)
|
510 |
+
if stream:
|
511 |
+
from transformers import TextIteratorStreamer
|
512 |
+
|
513 |
+
streamer = TextIteratorStreamer(
|
514 |
+
self.tokenizer,
|
515 |
+
timeout=10.0,
|
516 |
+
skip_prompt=True,
|
517 |
+
skip_special_tokens=True,
|
518 |
+
)
|
519 |
+
generate_kwargs["streamer"] = streamer
|
520 |
+
|
521 |
+
t = Thread(target=self.model.generate, kwargs=generate_kwargs)
|
522 |
+
t.start()
|
523 |
+
|
524 |
+
def chunk_generator(chunks):
|
525 |
+
for part in chunks:
|
526 |
+
yield {
|
527 |
+
"id": completion_id,
|
528 |
+
"object": "text_completion",
|
529 |
+
"created": created,
|
530 |
+
"model": model_name,
|
531 |
+
"choices": [
|
532 |
+
{
|
533 |
+
"text": part,
|
534 |
+
"index": 0,
|
535 |
+
"logprobs": None,
|
536 |
+
"finish_reason": None,
|
537 |
+
}
|
538 |
+
],
|
539 |
+
}
|
540 |
+
|
541 |
+
chunks: Iterator[CompletionChunk] = chunk_generator(streamer)
|
542 |
+
return chunks
|
543 |
+
|
544 |
+
else:
|
545 |
+
output_ids = self.model.generate(
|
546 |
+
**generate_kwargs,
|
547 |
+
)
|
548 |
+
total_tokens_len = len(output_ids[0])
|
549 |
+
output = self.tokenizer.decode(
|
550 |
+
output_ids[0][prompt_tokens_len:], skip_special_tokens=True
|
551 |
+
)
|
552 |
+
completion: Completion = {
|
553 |
+
"id": completion_id,
|
554 |
+
"object": "text_completion",
|
555 |
+
"created": created,
|
556 |
+
"model": model_name,
|
557 |
+
"choices": [
|
558 |
+
{
|
559 |
+
"text": output,
|
560 |
+
"index": 0,
|
561 |
+
"logprobs": None,
|
562 |
+
"finish_reason": None,
|
563 |
+
}
|
564 |
+
],
|
565 |
+
"usage": {
|
566 |
+
"prompt_tokens": prompt_tokens_len,
|
567 |
+
"completion_tokens": total_tokens_len - prompt_tokens_len,
|
568 |
+
"total_tokens": total_tokens_len,
|
569 |
+
},
|
570 |
+
}
|
571 |
+
return completion
|
572 |
+
|
573 |
+
def chat_completion(
|
574 |
+
self,
|
575 |
+
messages: List[Message],
|
576 |
+
stream: bool = False,
|
577 |
+
max_new_tokens: int = 1000,
|
578 |
+
temperature: float = 0.9,
|
579 |
+
top_p: float = 1.0,
|
580 |
+
top_k: int = 40,
|
581 |
+
repetition_penalty: float = 1.0,
|
582 |
+
**kwargs: Any,
|
583 |
+
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
|
584 |
+
"""For OpenAI compatible API /v1/chat/completions
|
585 |
+
Generate text from a dialog (chat history).
|
586 |
+
|
587 |
+
Examples:
|
588 |
+
>>> llama2_wrapper = LLAMA2_WRAPPER()
|
589 |
+
>>> dialog = [
|
590 |
+
{
|
591 |
+
"role":"system",
|
592 |
+
"content":"You are a helpful, respectful and honest assistant. "
|
593 |
+
},{
|
594 |
+
"role":"user",
|
595 |
+
"content":"Hi do you know Pytorch?",
|
596 |
+
},
|
597 |
+
]
|
598 |
+
>>> print(llm.chat_completion(dialog))
|
599 |
+
|
600 |
+
Args:
|
601 |
+
dialog: The dialog (chat history) to generate text from.
|
602 |
+
stream: Whether to stream the results.
|
603 |
+
max_new_tokens: The maximum number of tokens to generate.
|
604 |
+
temperature: The temperature to use for sampling.
|
605 |
+
top_p: The top-p value to use for sampling.
|
606 |
+
top_k: The top-k value to use for sampling.
|
607 |
+
repetition_penalty: The penalty to apply to repeated tokens.
|
608 |
+
kwargs: all other arguments.
|
609 |
+
|
610 |
+
Raises:
|
611 |
+
ValueError: If the requested tokens exceed the context window.
|
612 |
+
RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
|
613 |
+
|
614 |
+
Returns:
|
615 |
+
Response object containing the generated text.
|
616 |
+
"""
|
617 |
+
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
618 |
+
created: int = int(time.time())
|
619 |
+
model_name: str = (
|
620 |
+
self.backend_type + " default model"
|
621 |
+
if self.model_path == ""
|
622 |
+
else self.model_path
|
623 |
+
)
|
624 |
+
if self.backend_type is BackendType.LLAMA_CPP:
|
625 |
+
completion_or_chunks = self.model.create_chat_completion(
|
626 |
+
messages,
|
627 |
+
stream=stream,
|
628 |
+
max_tokens=max_new_tokens,
|
629 |
+
temperature=temperature,
|
630 |
+
top_p=top_p,
|
631 |
+
top_k=top_k,
|
632 |
+
repeat_penalty=repetition_penalty,
|
633 |
+
**kwargs,
|
634 |
+
)
|
635 |
+
if stream:
|
636 |
+
chunks: Iterator[ChatCompletionChunk] = completion_or_chunks
|
637 |
+
return chunks
|
638 |
+
return completion_or_chunks
|
639 |
+
elif self.backend_type is BackendType.LLAMA2_CU:
|
640 |
+
pass # TODO
|
641 |
+
else:
|
642 |
+
prompt = get_prompt_for_dialog(messages)
|
643 |
+
inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
|
644 |
+
prompt_tokens_len = len(inputs[0])
|
645 |
+
inputs = inputs.to("cuda")
|
646 |
+
generate_kwargs = dict(
|
647 |
+
inputs=inputs,
|
648 |
+
max_new_tokens=max_new_tokens,
|
649 |
+
temperature=temperature,
|
650 |
+
top_p=top_p,
|
651 |
+
top_k=top_k,
|
652 |
+
repetition_penalty=repetition_penalty,
|
653 |
+
# num_beams=1,
|
654 |
+
)
|
655 |
+
generate_kwargs = (
|
656 |
+
generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
|
657 |
+
)
|
658 |
+
if stream:
|
659 |
+
from transformers import TextIteratorStreamer
|
660 |
+
|
661 |
+
streamer = TextIteratorStreamer(
|
662 |
+
self.tokenizer,
|
663 |
+
timeout=10.0,
|
664 |
+
skip_prompt=True,
|
665 |
+
skip_special_tokens=True,
|
666 |
+
)
|
667 |
+
generate_kwargs["streamer"] = streamer
|
668 |
+
t = Thread(target=self.model.generate, kwargs=generate_kwargs)
|
669 |
+
t.start()
|
670 |
+
|
671 |
+
def chunk_generator(chunks):
|
672 |
+
yield {
|
673 |
+
"id": "chat" + completion_id,
|
674 |
+
"model": model_name,
|
675 |
+
"created": created,
|
676 |
+
"object": "chat.completion.chunk",
|
677 |
+
"choices": [
|
678 |
+
{
|
679 |
+
"index": 0,
|
680 |
+
"delta": {
|
681 |
+
"role": "assistant",
|
682 |
+
},
|
683 |
+
"finish_reason": None,
|
684 |
+
}
|
685 |
+
],
|
686 |
+
}
|
687 |
+
for part in enumerate(chunks):
|
688 |
+
yield {
|
689 |
+
"id": "chat" + completion_id,
|
690 |
+
"model": model_name,
|
691 |
+
"created": created,
|
692 |
+
"object": "chat.completion.chunk",
|
693 |
+
"choices": [
|
694 |
+
{
|
695 |
+
"index": 0,
|
696 |
+
"delta": {
|
697 |
+
"content": part,
|
698 |
+
},
|
699 |
+
"finish_reason": None,
|
700 |
+
}
|
701 |
+
],
|
702 |
+
}
|
703 |
+
|
704 |
+
chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer)
|
705 |
+
return chunks
|
706 |
+
|
707 |
+
else:
|
708 |
+
output_ids = self.model.generate(
|
709 |
+
**generate_kwargs,
|
710 |
+
)
|
711 |
+
total_tokens_len = len(output_ids[0])
|
712 |
+
output = self.tokenizer.decode(
|
713 |
+
output_ids[0][prompt_tokens_len:], skip_special_tokens=True
|
714 |
+
)
|
715 |
+
chatcompletion: ChatCompletion = {
|
716 |
+
"id": "chat" + completion_id,
|
717 |
+
"object": "chat.completion",
|
718 |
+
"created": created,
|
719 |
+
"model": model_name,
|
720 |
+
"choices": [
|
721 |
+
{
|
722 |
+
"index": 0,
|
723 |
+
"message": {
|
724 |
+
"role": "assistant",
|
725 |
+
"content": output,
|
726 |
+
},
|
727 |
+
"finish_reason": None,
|
728 |
+
}
|
729 |
+
],
|
730 |
+
"usage": {
|
731 |
+
"prompt_tokens": prompt_tokens_len,
|
732 |
+
"completion_tokens": total_tokens_len - prompt_tokens_len,
|
733 |
+
"total_tokens": total_tokens_len,
|
734 |
+
},
|
735 |
+
}
|
736 |
+
return chatcompletion
|
737 |
+
|
738 |
+
|
739 |
+
def get_prompt_for_dialog(dialog: List[Message]) -> str:
|
740 |
+
"""Process dialog (chat history) to llama2 prompt for
|
741 |
+
OpenAI compatible API /v1/chat/completions.
|
742 |
+
|
743 |
+
Examples:
|
744 |
+
>>> dialog = [
|
745 |
+
{
|
746 |
+
"role":"system",
|
747 |
+
"content":"You are a helpful, respectful and honest assistant. "
|
748 |
+
},{
|
749 |
+
"role":"user",
|
750 |
+
"content":"Hi do you know Pytorch?",
|
751 |
+
},
|
752 |
+
]
|
753 |
+
>>> prompt = get_prompt_for_dialog("Hi do you know Pytorch?")
|
754 |
+
|
755 |
+
Args:
|
756 |
+
dialog: The dialog (chat history) to generate text from.
|
757 |
+
|
758 |
+
Yields:
|
759 |
+
prompt string.
|
760 |
+
"""
|
761 |
+
# add "<<SYS>>\n{system_prompt}\n<</SYS>>\n\n" in first dialog
|
762 |
+
if dialog[0]["role"] == "system":
|
763 |
+
dialog = [
|
764 |
+
{
|
765 |
+
"role": dialog[1]["role"],
|
766 |
+
"content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"],
|
767 |
+
}
|
768 |
+
] + dialog[2:]
|
769 |
+
# check roles
|
770 |
+
assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
|
771 |
+
[msg["role"] == "assistant" for msg in dialog[1::2]]
|
772 |
+
), (
|
773 |
+
"model only supports 'system', 'user' and 'assistant' roles, "
|
774 |
+
"starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
|
775 |
+
)
|
776 |
+
# add chat history
|
777 |
+
texts = []
|
778 |
+
for prompt, answer in zip(
|
779 |
+
dialog[::2],
|
780 |
+
dialog[1::2],
|
781 |
+
):
|
782 |
+
texts.append(
|
783 |
+
f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} "
|
784 |
+
)
|
785 |
+
# check last message if role is user, then add it to prompt text
|
786 |
+
assert (
|
787 |
+
dialog[-1]["role"] == "user"
|
788 |
+
), f"Last message must be from user, got {dialog[-1]['role']}"
|
789 |
+
texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}")
|
790 |
+
return "".join(texts)
|
791 |
+
|
792 |
+
|
793 |
+
def get_prompt(
|
794 |
+
message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = ""
|
795 |
+
) -> str:
|
796 |
+
"""Process message to llama2 prompt with chat history
|
797 |
+
and system_prompt for chatbot.
|
798 |
+
|
799 |
+
Examples:
|
800 |
+
>>> prompt = get_prompt("Hi do you know Pytorch?")
|
801 |
+
|
802 |
+
Args:
|
803 |
+
message: The origianl chat message to generate text from.
|
804 |
+
chat_history: Chat history list from chatbot.
|
805 |
+
system_prompt: System prompt for chatbot.
|
806 |
+
|
807 |
+
Yields:
|
808 |
+
prompt string.
|
809 |
+
"""
|
810 |
+
texts = [f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
|
811 |
+
for user_input, response in chat_history:
|
812 |
+
texts.append(f"{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ")
|
813 |
+
texts.append(f"{message.strip()} [/INST]")
|
814 |
+
return "".join(texts)
|
815 |
+
|
816 |
+
|
817 |
+
class BackendType(Enum):
|
818 |
+
UNKNOWN = 0
|
819 |
+
TRANSFORMERS = 1
|
820 |
+
GPTQ = 2
|
821 |
+
LLAMA_CPP = 3
|
822 |
+
LLAMA2_CU = 4
|
823 |
+
|
824 |
+
@classmethod
|
825 |
+
def get_type(cls, backend_name: str):
|
826 |
+
backend_type = None
|
827 |
+
backend_name_lower = backend_name.lower()
|
828 |
+
if "transformers" in backend_name_lower:
|
829 |
+
backend_type = BackendType.TRANSFORMERS
|
830 |
+
elif "gptq" in backend_name_lower:
|
831 |
+
backend_type = BackendType.GPTQ
|
832 |
+
elif "cpp" in backend_name_lower:
|
833 |
+
backend_type = BackendType.LLAMA_CPP
|
834 |
+
elif "cu" in backend_name_lower:
|
835 |
+
backend_type = BackendType.LLAMA2_CU
|
836 |
+
else:
|
837 |
+
raise Exception("Unknown backend: " + backend_name)
|
838 |
+
# backend_type = BackendType.UNKNOWN
|
839 |
+
return backend_type
|
llama2_wrapper/server/__init__.py
ADDED
File without changes
|
llama2_wrapper/server/__main__.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Example FastAPI server for llama2_wrapper.
|
2 |
+
|
3 |
+
To run this example:
|
4 |
+
|
5 |
+
```
|
6 |
+
python3 -m llama2_wrapper.server
|
7 |
+
```
|
8 |
+
|
9 |
+
or
|
10 |
+
|
11 |
+
```
|
12 |
+
uvicorn llama2_wrapper.server.app:app --reload
|
13 |
+
```
|
14 |
+
|
15 |
+
Then visit http://localhost:8000/docs to see the interactive API docs.
|
16 |
+
|
17 |
+
"""
|
18 |
+
import os
|
19 |
+
import argparse
|
20 |
+
|
21 |
+
import uvicorn
|
22 |
+
|
23 |
+
from llama2_wrapper.server.app import create_app, Settings
|
24 |
+
|
25 |
+
if __name__ == "__main__":
|
26 |
+
parser = argparse.ArgumentParser()
|
27 |
+
for name, field in Settings.model_fields.items():
|
28 |
+
description = field.description
|
29 |
+
if field.default is not None and description is not None:
|
30 |
+
description += f" (default: {field.default})"
|
31 |
+
parser.add_argument(
|
32 |
+
f"--{name}",
|
33 |
+
dest=name,
|
34 |
+
type=field.annotation if field.annotation is not None else str,
|
35 |
+
help=description,
|
36 |
+
)
|
37 |
+
|
38 |
+
args = parser.parse_args()
|
39 |
+
settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
|
40 |
+
app = create_app(settings=settings)
|
41 |
+
|
42 |
+
uvicorn.run(
|
43 |
+
app,
|
44 |
+
host=os.getenv("HOST", settings.host),
|
45 |
+
port=int(os.getenv("PORT", settings.port)),
|
46 |
+
)
|
llama2_wrapper/server/app.py
ADDED
@@ -0,0 +1,526 @@
import json
import multiprocessing
from re import compile, Match, Pattern
from threading import Lock
from functools import partial
from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict
from typing_extensions import TypedDict, Literal

import anyio
from anyio.streams.memory import MemoryObjectSendStream
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
from fastapi import Depends, FastAPI, APIRouter, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRoute
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
from sse_starlette.sse import EventSourceResponse

from llama2_wrapper.model import LLAMA2_WRAPPER
from llama2_wrapper.types import (
    Completion,
    CompletionChunk,
    ChatCompletion,
    ChatCompletionChunk,
)


class Settings(BaseSettings):
    model_path: str = Field(
        default="",
        description="The path to the model to use for generating completions.",
    )
    backend_type: str = Field(
        default="llama.cpp",
        description="Backend for llama2, options: llama.cpp, gptq, transformers, llama2.cu",
    )
    max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.")
    load_in_8bit: bool = Field(
        default=False,
        description="Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).",
    )
    verbose: bool = Field(
        default=False,
        description="Whether to print verbose output to stderr.",
    )
    host: str = Field(default="localhost", description="API address")
    port: int = Field(default=8000, description="API port")
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )


class ErrorResponse(TypedDict):
    """OpenAI style error response"""

    message: str
    type: str
    param: Optional[str]
    code: Optional[str]


class ErrorResponseFormatters:
    """Collection of formatters for error responses.

    Args:
        request (Union[CreateCompletionRequest, CreateChatCompletionRequest]):
            Request body
        match (Match[str]): Match object from regex pattern

    Returns:
        Tuple[int, ErrorResponse]: Status code and error response
    """

    @staticmethod
    def context_length_exceeded(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str]  # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for context length exceeded error"""

        context_window = int(match.group(2))
        prompt_tokens = int(match.group(1))
        completion_tokens = request.max_new_tokens
        if hasattr(request, "messages"):
            # Chat completion
            message = (
                "This model's maximum context length is {} tokens. "
                "However, you requested {} tokens "
                "({} in the messages, {} in the completion). "
                "Please reduce the length of the messages or completion."
            )
        else:
            # Text completion
            message = (
                "This model's maximum context length is {} tokens, "
                "however you requested {} tokens "
                "({} in your prompt; {} for the completion). "
                "Please reduce your prompt; or completion length."
            )
        return 400, ErrorResponse(
            message=message.format(
                context_window,
                completion_tokens + prompt_tokens,
                prompt_tokens,
                completion_tokens,
            ),
            type="invalid_request_error",
            param="messages",
            code="context_length_exceeded",
        )

    @staticmethod
    def model_not_found(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str]  # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for model_not_found error"""

        model_path = str(match.group(1))
        message = f"The model `{model_path}` does not exist"
        return 400, ErrorResponse(
            message=message,
            type="invalid_request_error",
            param=None,
            code="model_not_found",
        )


class RouteErrorHandler(APIRoute):
    """Custom APIRoute that handles application errors and exceptions"""

    # key: regex pattern for original error message from llama_cpp
    # value: formatter function
    pattern_and_formatters: Dict[
        "Pattern",
        Callable[
            [
                Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
                "Match[str]",
            ],
            Tuple[int, ErrorResponse],
        ],
    ] = {
        compile(
            r"Requested tokens \((\d+)\) exceed context window of (\d+)"
        ): ErrorResponseFormatters.context_length_exceeded,
        compile(
            r"Model path does not exist: (.+)"
        ): ErrorResponseFormatters.model_not_found,
    }

    def error_message_wrapper(
        self,
        error: Exception,
        body: Optional[
            Union[
                "CreateChatCompletionRequest",
                "CreateCompletionRequest",
            ]
        ] = None,
    ) -> Tuple[int, ErrorResponse]:
        """Wraps error message in OpenAI style error response"""

        if body is not None and isinstance(
            body,
            (
                CreateCompletionRequest,
                CreateChatCompletionRequest,
            ),
        ):
            # When text completion or chat completion
            for pattern, callback in self.pattern_and_formatters.items():
                match = pattern.search(str(error))
                if match is not None:
                    return callback(body, match)

        # Wrap other errors as internal server error
        return 500, ErrorResponse(
            message=str(error),
            type="internal_server_error",
            param=None,
            code=None,
        )

    def get_route_handler(
        self,
    ) -> Callable[[Request], Coroutine[None, None, Response]]:
        """Defines custom route handler that catches exceptions and formats
        in OpenAI style error response"""

        original_route_handler = super().get_route_handler()

        async def custom_route_handler(request: Request) -> Response:
            try:
                return await original_route_handler(request)
            except Exception as exc:
                json_body = await request.json()
                try:
                    if "messages" in json_body:
                        # Chat completion
                        body: Optional[
                            Union[
                                CreateChatCompletionRequest,
                                CreateCompletionRequest,
                            ]
                        ] = CreateChatCompletionRequest(**json_body)
                    elif "prompt" in json_body:
                        # Text completion
                        body = CreateCompletionRequest(**json_body)
                    # else:
                    #     # Embedding
                    #     body = CreateEmbeddingRequest(**json_body)
                except Exception:
                    # Invalid request body
                    body = None

                # Get proper error message from the exception
                (
                    status_code,
                    error_message,
                ) = self.error_message_wrapper(error=exc, body=body)
                return JSONResponse(
                    {"error": error_message},
                    status_code=status_code,
                )

        return custom_route_handler


router = APIRouter(route_class=RouteErrorHandler)

settings: Optional[Settings] = None
llama2: Optional[LLAMA2_WRAPPER] = None


def create_app(settings: Optional[Settings] = None):
    if settings is None:
        settings = Settings()
    app = FastAPI(
        title="llama2-wrapper Fast API",
        version="0.0.1",
    )
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.include_router(router)
    global llama2
    llama2 = LLAMA2_WRAPPER(
        model_path=settings.model_path,
        backend_type=settings.backend_type,
        max_tokens=settings.max_tokens,
        load_in_8bit=settings.load_in_8bit,
        verbose=settings.verbose,
    )

    def set_settings(_settings: Settings):
        global settings
        settings = _settings

    set_settings(settings)
    return app


llama_outer_lock = Lock()
llama_inner_lock = Lock()


def get_llama():
    # NOTE: This double lock allows the currently streaming llama model to
    # check if any other requests are pending in the same thread and cancel
    # the stream if so.
    llama_outer_lock.acquire()
    release_outer_lock = True
    try:
        llama_inner_lock.acquire()
        try:
            llama_outer_lock.release()
            release_outer_lock = False
            yield llama2
        finally:
            llama_inner_lock.release()
    finally:
        if release_outer_lock:
            llama_outer_lock.release()


def get_settings():
    yield settings


async def get_event_publisher(
    request: Request,
    inner_send_chan: MemoryObjectSendStream,
    iterator: Iterator,
):
    async with inner_send_chan:
        try:
            async for chunk in iterate_in_threadpool(iterator):
                await inner_send_chan.send(dict(data=json.dumps(chunk)))
                if await request.is_disconnected():
                    raise anyio.get_cancelled_exc_class()()
                if settings.interrupt_requests and llama_outer_lock.locked():
                    await inner_send_chan.send(dict(data="[DONE]"))
                    raise anyio.get_cancelled_exc_class()()
            await inner_send_chan.send(dict(data="[DONE]"))
        except anyio.get_cancelled_exc_class() as e:
            print("disconnected")
            with anyio.move_on_after(1, shield=True):
                print(f"Disconnected from client (via refresh/close) {request.client}")
                raise e


stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)
max_new_tokens_field = Field(
    default=1000, ge=1, description="The maximum number of tokens to generate."
)

temperature_field = Field(
    default=0.9,
    ge=0.0,
    le=2.0,
    description="The temperature to use for sampling.",
)

top_p_field = Field(
    default=1.0,
    ge=0.0,
    le=1.0,
    description="The top-p value to use for sampling.",
)
top_k_field = Field(
    default=40,
    ge=0,
    description="The top-k value to use for sampling.",
)
repetition_penalty_field = Field(
    default=1.0,
    ge=0.0,
    description="The penalty to apply to repeated tokens.",
)
# stop_field = Field(
#     default=None,
#     description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
# )


class CreateCompletionRequest(BaseModel):
    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate text from."
    )
    stream: bool = stream_field
    max_new_tokens: int = max_new_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    top_k: int = top_k_field
    repetition_penalty: float = repetition_penalty_field
    # stop: Optional[Union[str, List[str]]] = stop_field

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                    # "stop": ["\n", "###"],
                }
            ]
        }
    }


@router.post(
    "/v1/completions",
)
async def create_completion(
    request: Request,
    body: CreateCompletionRequest,
    llama2: LLAMA2_WRAPPER = Depends(get_llama),
) -> Completion:
    if isinstance(body.prompt, list):
        assert len(body.prompt) <= 1
        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""

    kwargs = body.model_dump()

    iterator_or_completion: Union[
        Completion, Iterator[CompletionChunk]
    ] = await run_in_threadpool(llama2.completion, **kwargs)

    if isinstance(iterator_or_completion, Iterator):
        first_response = await run_in_threadpool(next, iterator_or_completion)

        # If no exception was raised from first_response, we can assume that
        # the iterator is valid and we can use it to stream the response.
        def iterator() -> Iterator[CompletionChunk]:
            yield first_response
            yield from iterator_or_completion

        send_chan, recv_chan = anyio.create_memory_object_stream(10)
        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(  # type: ignore
                get_event_publisher,
                request=request,
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
        )
    else:
        return iterator_or_completion


class ChatCompletionRequestMessage(BaseModel):
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")


class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )
    stream: bool = stream_field
    max_new_tokens: int = max_new_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    top_k: int = top_k_field
    repetition_penalty: float = repetition_penalty_field
    # stop: Optional[List[str]] = stop_field

    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "messages": [
                        ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ).model_dump(),
                        ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ).model_dump(),
                    ]
                }
            ]
        }
    }


@router.post(
    "/v1/chat/completions",
)
async def create_chat_completion(
    request: Request,
    body: CreateChatCompletionRequest,
    llama2: LLAMA2_WRAPPER = Depends(get_llama),
    settings: Settings = Depends(get_settings),
) -> ChatCompletion:
    kwargs = body.model_dump()

    iterator_or_completion: Union[
        ChatCompletion, Iterator[ChatCompletionChunk]
    ] = await run_in_threadpool(llama2.chat_completion, **kwargs)

    if isinstance(iterator_or_completion, Iterator):
        first_response = await run_in_threadpool(next, iterator_or_completion)

        # If no exception was raised from first_response, we can assume that
        # the iterator is valid and we can use it to stream the response.
        def iterator() -> Iterator[ChatCompletionChunk]:
            yield first_response
            yield from iterator_or_completion

        send_chan, recv_chan = anyio.create_memory_object_stream(10)
        return EventSourceResponse(
            recv_chan,
            data_sender_callable=partial(  # type: ignore
                get_event_publisher,
                request=request,
                inner_send_chan=send_chan,
                iterator=iterator(),
            ),
        )
    else:
        return iterator_or_completion


class ModelData(TypedDict):
    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]


class ModelList(TypedDict):
    object: Literal["list"]
    data: List[ModelData]


@router.get("/v1/models")
async def get_models(
    settings: Settings = Depends(get_settings),
) -> ModelList:
    assert llama2 is not None

    return {
        "object": "list",
        "data": [
            {
                "id": settings.backend_type + " default model"
                if settings.model_path == ""
                else settings.model_path,
                "object": "model",
                "owned_by": "me",
                "permissions": [],
            }
        ],
    }
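With the server running, the OpenAI-compatible routes above can be exercised like this (a sketch; it assumes the third-party requests package is installed and the server listens on the default host and port):

```
# Illustrative client call against /v1/chat/completions, not part of the upload.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ],
        "max_new_tokens": 200,
        "stream": False,
    },
    timeout=600,
)
# The non-streaming response follows the ChatCompletion shape defined in types.py.
print(resp.json()["choices"][0]["message"]["content"])
```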
llama2_wrapper/types.py
ADDED
@@ -0,0 +1,115 @@
from typing import Any, List, Optional, Dict, Union
from typing_extensions import TypedDict, NotRequired, Literal

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


# Role = Literal["system", "user", "assistant"]
# class Message(TypedDict):
#     role: Role
#     content: str


class ChatCompletionMessage(TypedDict):
    role: Literal["assistant", "user", "system"]
    content: str
    user: NotRequired[str]


# transformers: Message; llama.cpp: ChatCompletionMessage
Message = ChatCompletionMessage
Dialog = List[Message]


class EmbeddingUsage(TypedDict):
    prompt_tokens: int
    total_tokens: int


class EmbeddingData(TypedDict):
    index: int
    object: str
    embedding: List[float]


class Embedding(TypedDict):
    object: Literal["list"]
    model: str
    data: List[EmbeddingData]
    usage: EmbeddingUsage


class CompletionLogprobs(TypedDict):
    text_offset: List[int]
    token_logprobs: List[Optional[float]]
    tokens: List[str]
    top_logprobs: List[Optional[Dict[str, float]]]


class CompletionChoice(TypedDict):
    text: str
    index: int
    logprobs: Optional[CompletionLogprobs]
    finish_reason: Optional[str]


class CompletionUsage(TypedDict):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class CompletionChunk(TypedDict):
    id: str
    object: Literal["text_completion"]
    created: int
    model: str
    choices: List[CompletionChoice]


class Completion(TypedDict):
    id: str
    object: Literal["text_completion"]
    created: int
    model: str
    choices: List[CompletionChoice]
    usage: CompletionUsage


class ChatCompletionChoice(TypedDict):
    index: int
    message: ChatCompletionMessage
    finish_reason: Optional[str]


class ChatCompletion(TypedDict):
    id: str
    object: Literal["chat.completion"]
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: CompletionUsage


class ChatCompletionChunkDeltaEmpty(TypedDict):
    pass


class ChatCompletionChunkDelta(TypedDict):
    role: NotRequired[Literal["assistant"]]
    content: NotRequired[str]


class ChatCompletionChunkChoice(TypedDict):
    index: int
    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
    finish_reason: Optional[str]


class ChatCompletionChunk(TypedDict):
    id: str
    model: str
    object: Literal["chat.completion.chunk"]
    created: int
    choices: List[ChatCompletionChunkChoice]
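A short illustration of how these typed dicts are meant to be used when assembling a dialog (a sketch, not part of the committed files):

```
from llama2_wrapper.types import Message, Dialog

# Message/Dialog are TypedDicts, so instances are plain dicts with checked keys.
dialog: Dialog = [
    Message(role="system", content="You are a helpful assistant."),
    Message(role="user", content="Hi do you know Pytorch?"),
]
```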
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
prompts/prompts_en.csv
ADDED
The diff for this file is too large to render.
See raw diff
prompts/utils.py
ADDED
@@ -0,0 +1,48 @@
import csv
import os
from hashlib import md5


def read_csv_to_dict_list(file_path):
    with open(file_path, mode="r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        list_of_dicts = [row for row in reader]
    return list_of_dicts


def split_list_with_key(lst, dict_key):
    result = {}
    for row in lst:
        if row.get(dict_key) not in result:
            result[row.get(dict_key)] = []
        result[row.get(dict_key)].append(row)
    return result


def read_csv_to_type_dict(file_path, type_key):
    lst = read_csv_to_dict_list(file_path=file_path)
    return split_list_with_key(lst=lst, dict_key=type_key)


def md5_str(str):
    return md5(str.encode("utf8")).hexdigest()


current_dir = os.path.dirname(__file__)


class PromtsContainer(object):
    def __init__(self) -> None:
        prompts_path = os.path.join(current_dir, "prompts_en.csv")
        self.data = read_csv_to_type_dict(prompts_path, "type")
        self.summary_dict = {
            md5_str(row.get("summary")): row.get("prompt")
            for chunk in self.data.values()
            for row in chunk
        }

    def get_prompts_tab_dict(self):
        return self.data

    def get_prompt_by_summary(self, summary):
        return self.summary_dict.get(md5_str(summary), summary)
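A sketch of how PromtsContainer is typically consumed (it assumes prompts_en.csv has type, summary, and prompt columns, as the code above implies):

```
from prompts.utils import PromtsContainer

container = PromtsContainer()
tabs = container.get_prompts_tab_dict()       # {type: [csv rows]}
first_type = next(iter(tabs))
summary = tabs[first_type][0].get("summary")
# Returns the full prompt text for that summary, or the summary itself if unknown.
print(container.get_prompt_by_summary(summary))
```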
pyproject.toml
ADDED
@@ -0,0 +1,47 @@
[tool.poetry]
name = "llama2-wrapper"
version = "0.1.14"
description = "Use llama2-wrapper as your local llama2 backend for Generative Agents / Apps"
authors = ["liltom-eth <liltom.eth@gmail.com>"]
license = "MIT"
homepage = "https://github.com/liltom-eth/llama2-webui"
repository = "https://github.com/liltom-eth/llama2-webui"
readme = "./docs/pypi.md"

packages = [{include = "llama2_wrapper"}]

[tool.poetry.dependencies]
python = ">=3.10,<3.13"
accelerate = "^0.21.0"
auto-gptq = "0.3.0"
gradio = "3.37.0"
protobuf = "3.20.3"
scipy = "1.11.1"
sentencepiece = "0.1.99"
torch = "2.0.1"
transformers = "4.31.0"
tqdm = "4.65.0"
python-dotenv = "1.0.0"
llama-cpp-python = "0.2.11"
bitsandbytes = [
    {platform = 'linux', version = "0.40.2"},
    {platform = 'darwin', version = "0.40.2"},
]
memory-profiler = "0.61.0"
huggingface-hub = "0.16.4"
fastapi = "0.100.0"
uvicorn = "0.23.1"
sse-starlette = "1.6.5"
pydantic = "2.2.1"
pydantic-settings = "2.0.3"
pytest = "7.4.0"
black = "23.7.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[virtualenvs]
create = true
in-project = true
requirements.txt
ADDED
@@ -0,0 +1,21 @@
accelerate==0.21.0
auto-gptq==0.3.0
bitsandbytes==0.40.2
gradio==3.37.0
protobuf==3.20.3
scipy==1.11.1
sentencepiece==0.1.99
torch==2.0.1
transformers==4.31.0
tqdm==4.65.0
python-dotenv==1.0.0
llama-cpp-python==0.2.11
memory-profiler==0.61.0
huggingface-hub==0.16.4
fastapi==0.100.0
uvicorn==0.23.1
sse-starlette==1.6.5
pydantic==2.2.1
pydantic-settings==2.0.3
pytest==7.4.0
black==23.7.0
static/screenshot.png
ADDED
tests/__init__.py
ADDED
File without changes
tests/test_get_prompt.py
ADDED
@@ -0,0 +1,59 @@
import pytest
from llama2_wrapper.model import get_prompt_for_dialog


class TestClassGetPromptForDialog:
    from llama2_wrapper.types import Message

    dialog = []
    message1 = Message(
        role="system",
        content="You are a helpful, respectful and honest assistant. ",
    )
    message2 = Message(
        role="user",
        content="Hi do you know Pytorch?",
    )
    dialog.append(message1)
    dialog.append(message2)

    dialog2 = []
    dialog2.append(message1)
    dialog2.append(message2)
    message3 = Message(
        role="assistant",
        content="Yes I know Pytorch. ",
    )
    message4 = Message(
        role="user",
        content="Can you write a CNN in Pytorch?",
    )
    dialog2.append(message3)
    dialog2.append(message4)

    dialog3 = []
    dialog3.append(message3)
    dialog3.append(message4)
    dialog3.append(message3)
    dialog3.append(message4)
    message5 = Message(
        role="assistant",
        content="Yes I can write a CNN in Pytorch.",
    )
    dialog3.append(message5)

    def test_dialog1(self):
        prompt = get_prompt_for_dialog(self.dialog)
        # print(prompt)
        result = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. \n<</SYS>>\n\nHi do you know Pytorch? [/INST]"""
        assert prompt == result

    def test_dialog2(self):
        prompt = get_prompt_for_dialog(self.dialog2)
        # print(prompt)
        result = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. \n<</SYS>>\n\nHi do you know Pytorch? [/INST] Yes I know Pytorch. [INST] Can you write a CNN in Pytorch? [/INST]"""
        assert prompt == result

    def test_dialog3(self):
        with pytest.raises(AssertionError):
            prompt = get_prompt_for_dialog(self.dialog3)
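These tests can be run with pytest, for example:

```
# Run from the repository root (sketch; equivalent to invoking pytest on the CLI).
import pytest

pytest.main(["-q", "tests/test_get_prompt.py"])
```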