Spaces:
Running
Running
Commit
•
f3dc796
1
Parent(s):
29e37fd
update
Browse files- app.py +4 -6
- src/assets/text_content.py +8 -6
app.py
CHANGED
@@ -24,12 +24,14 @@ ALL_COLUMNS_MAPPING = {
|
|
24 |
# model
|
25 |
"Model": "Model 🤗",
|
26 |
"Arch": "Arch ποΈ",
|
27 |
-
"Size": "Size
|
28 |
# deployment settings
|
29 |
"backend.name": "Backend π",
|
30 |
"backend.torch_dtype": "Dtype 📥",
|
31 |
"optimizations": "Optimizations 🛠️",
|
32 |
"quantization": "Quantization ποΈ",
|
|
|
|
|
33 |
# throughput measurements
|
34 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
|
35 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
|
@@ -42,8 +44,6 @@ ALL_COLUMNS_MAPPING = {
|
|
42 |
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
43 |
# energy measurements
|
44 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
45 |
-
# quality measurements
|
46 |
-
"Score": "Avg Score (%) β¬οΈ",
|
47 |
}
|
48 |
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
49 |
SORTING_ASCENDING = [False, True]
|
@@ -148,9 +148,7 @@ def get_benchmark_chart(bench_df):
|
|
148 |
copy_df = bench_df.copy()
|
149 |
# transform
|
150 |
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
151 |
-
#
|
152 |
-
# copy_df = copy_df[copy_df["E2E Latency (s) β¬οΈ"] <= 100]
|
153 |
-
|
154 |
fig = px.scatter(
|
155 |
copy_df,
|
156 |
y="Avg Score (%) β¬οΈ",
|
|
|
24 |
# model
|
25 |
"Model": "Model 🤗",
|
26 |
"Arch": "Arch ποΈ",
|
27 |
+
"Size": "Size π",
|
28 |
# deployment settings
|
29 |
"backend.name": "Backend π",
|
30 |
"backend.torch_dtype": "Dtype 📥",
|
31 |
"optimizations": "Optimizations 🛠️",
|
32 |
"quantization": "Quantization ποΈ",
|
33 |
+
# quality measurements
|
34 |
+
"Score": "Avg Score (%) β¬οΈ",
|
35 |
# throughput measurements
|
36 |
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
|
37 |
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
|
|
|
44 |
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
45 |
# energy measurements
|
46 |
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
|
|
|
|
47 |
}
|
48 |
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
49 |
SORTING_ASCENDING = [False, True]
|
|
|
148 |
copy_df = bench_df.copy()
|
149 |
# transform
|
150 |
copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
|
151 |
+
# plot
|
|
|
|
|
152 |
fig = px.scatter(
|
153 |
copy_df,
|
154 |
y="Avg Score (%) β¬οΈ",
|
src/assets/text_content.py
CHANGED
@@ -12,7 +12,7 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
|
12 |
<ul>
|
13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
|
15 |
-
<li>LLMs are running on a singleton batch with a prompt size of
|
16 |
<li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
|
17 |
<li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
|
18 |
<li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
|
@@ -44,19 +44,21 @@ device: cuda
|
|
44 |
|
45 |
backend:
|
46 |
no_weights: true
|
47 |
-
delete_cache: true
|
48 |
torch_dtype: float16
|
49 |
-
quantization_strategy: gptq
|
50 |
bettertransformer: true
|
|
|
|
|
51 |
|
52 |
benchmark:
|
53 |
memory: true
|
54 |
-
|
|
|
|
|
55 |
input_shapes:
|
56 |
batch_size: 1
|
57 |
-
sequence_length:
|
|
|
58 |
|
59 |
-
new_tokens: 1000
|
60 |
```
|
61 |
"""
|
62 |
|
|
|
12 |
<ul>
|
13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
|
15 |
+
<li>LLMs are running on a singleton batch with a prompt size of 256 and generating a 1000 tokens.</li>
|
16 |
<li>Peak memory is measured in MB during the generate pass using Py3NVML while assuring the GPU's isolation.</li>
|
17 |
<li>Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.</li>
|
18 |
<li>Each pair of (Model Type, Weight Class) is represented by the best scored model. This LLM is the one used for all the hardware/backend/optimization experiments.</li>
|
|
|
44 |
|
45 |
backend:
|
46 |
no_weights: true
|
|
|
47 |
torch_dtype: float16
|
|
|
48 |
bettertransformer: true
|
49 |
+
quantization_scheme: gptq
|
50 |
+
|
51 |
|
52 |
benchmark:
|
53 |
memory: true
|
54 |
+
energy: true
|
55 |
+
|
56 |
+
new_tokens: 1000
|
57 |
input_shapes:
|
58 |
batch_size: 1
|
59 |
+
sequence_length: 256
|
60 |
+
|
61 |
|
|
|
62 |
```
|
63 |
"""
|
64 |
|