Spaces:

ml-energy
/

leaderboard

Running

App Files Files Community

Zhiyu Wu committed on Jul 25, 2023

Commit

f0128b6

•

1 Parent(s): 327a44b

Add llama2, sort ShareGPT dataset by length (#18)

Browse files

Files changed (11) hide show

README.md +2 -2
data/A40_chat-concise_benchmark.csv +2 -0
data/A40_chat_benchmark.csv +2 -0
data/A40_instruct-concise_benchmark.csv +2 -0
data/A40_instruct_benchmark.csv +2 -0
data/score.csv +2 -0
pegasus/benchmark.yaml +1 -1
requirements-benchmark.txt +1 -1
scripts/benchmark.py +12 -6
sharegpt/README.md +5 -0
sharegpt/{sg_90k_part1_html_cleaned_lang_first_sampled.json → sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json} +0 -0

README.md CHANGED Viewed

@@ -52,6 +52,6 @@ We run benchmarks using multiple nodes and GPUs using [Pegasus](https://github.c
 You can still run benchmarks without Pegasus like this:
 ```console
-$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
-$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
 ```

 You can still run benchmarks without Pegasus like this:
 ```console
+$ docker exec leaderboard0 python scripts/benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-13B --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
+$ docker exec leaderboard0 python scripts/benchmark.py --model-path databricks/dolly-v2-12b --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
 ```

data/A40_chat-concise_benchmark.csv CHANGED Viewed

@@ -19,3 +19,5 @@ metaai/llama-7B,25.80475014752762,63.463734049697784,2.2525196486312047,539.0479
 Neutralzz/BiLLa-7B-SFT,29.382300021941255,141.6155137676293,4.84122748247456,1131.9990564138398
 openaccess-ai-collective/manticore-13b-chat-pyg,17.220798012743607,268.91269308260576,15.692034786355059,4051.8244570182064
 FreedomIntelligence/phoenix-inst-chat-7b,32.33242374435414,229.95869711215582,6.910495058340042,2049.7076356614534

 Neutralzz/BiLLa-7B-SFT,29.382300021941255,141.6155137676293,4.84122748247456,1131.9990564138398
 openaccess-ai-collective/manticore-13b-chat-pyg,17.220798012743607,268.91269308260576,15.692034786355059,4051.8244570182064
 FreedomIntelligence/phoenix-inst-chat-7b,32.33242374435414,229.95869711215582,6.910495058340042,2049.7076356614534
+metaai/Llama-2-13b-chat-hf,16.934647828854768,358.7941571524513,20.990738735323337,3942.400414707617
+metaai/Llama-2-7b-chat-hf,31.733044836542074,402.6699126930826,12.569092892522697,2398.9215396235386

data/A40_chat_benchmark.csv CHANGED Viewed

@@ -19,3 +19,5 @@ BAIR/koala-7b,29.723806931945834,260.7196104768301,8.720630589929986,2017.329562
 BAIR/koala-13b,17.451436035057224,262.5295500335796,15.030911340299886,3827.6102800537265
 StabilityAI/stablelm-tuned-alpha-7b,26.413142361637988,255.34687709872398,9.454673889303727,2319.91146675621
 togethercomputer/RedPajama-INCITE-7B-Chat,21.410571862447824,279.5094022834117,12.506414288534286,2541.441298522497

 BAIR/koala-13b,17.451436035057224,262.5295500335796,15.030911340299886,3827.6102800537265
 StabilityAI/stablelm-tuned-alpha-7b,26.413142361637988,255.34687709872398,9.454673889303727,2319.91146675621
 togethercomputer/RedPajama-INCITE-7B-Chat,21.410571862447824,279.5094022834117,12.506414288534286,2541.441298522497
+metaai/Llama-2-13b-chat-hf,16.95804416983929,384.7333781061115,22.55271715111622,4337.670243116255
+metaai/Llama-2-7b-chat-hf,31.922994116700572,428.19341840161184,13.367807321468502,2556.7166067830576

data/A40_instruct-concise_benchmark.csv CHANGED Viewed

@@ -19,3 +19,5 @@ Neutralzz/BiLLa-7B-SFT,29.118626503392385,104.97817327065144,3.5443721553023035,
 nomic-ai/gpt4all-13b-snoozy,17.423064750595767,135.3938885157824,7.734149922101941,1871.6546057756862
 project-baize/baize-v2-7B,28.13796712305154,262.9902619207522,9.250474432119292,2105.324460711873
 lmsys/fastchat-t5-3b-v1.0,40.20822673632634,281.74110141034254,10.492163513616964,1110.3276249158694

 nomic-ai/gpt4all-13b-snoozy,17.423064750595767,135.3938885157824,7.734149922101941,1871.6546057756862
 project-baize/baize-v2-7B,28.13796712305154,262.9902619207522,9.250474432119292,2105.324460711873
 lmsys/fastchat-t5-3b-v1.0,40.20822673632634,281.74110141034254,10.492163513616964,1110.3276249158694
+metaai/Llama-2-13b-chat-hf,16.753336372767794,223.39019476158495,12.93183804940574,2423.302869711249
+metaai/Llama-2-7b-chat-hf,30.95799874634315,220.83680322364003,6.815573463441101,1288.2125369376631

data/A40_instruct_benchmark.csv CHANGED Viewed

@@ -19,3 +19,5 @@ lmsys/fastchat-t5-3b-v1.0,31.014371537480102,357.13734049697786,17.9643423938542
 nomic-ai/gpt4all-13b-snoozy,17.558360268154225,232.67461383478846,13.290953806575821,3411.2449123573792
 BAIR/koala-13b,17.468010116614902,254.08529214237743,14.4913390549458,3858.416870718604
 metaai/llama-7B,26.40244189851013,104.19308260577569,3.608983782098236,864.4181752854275

 nomic-ai/gpt4all-13b-snoozy,17.558360268154225,232.67461383478846,13.290953806575821,3411.2449123573792
 BAIR/koala-13b,17.468010116614902,254.08529214237743,14.4913390549458,3858.416870718604
 metaai/llama-7B,26.40244189851013,104.19308260577569,3.608983782098236,864.4181752854275
+metaai/Llama-2-13b-chat-hf,16.999960399598052,371.56312961719277,21.688517364074986,4210.194823371436
+metaai/Llama-2-7b-chat-hf,31.815139493955602,365.40362659503023,11.316028104293823,2180.2478049026786

data/score.csv CHANGED Viewed

@@ -18,3 +18,5 @@ FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.
 camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
 Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
 togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241

 camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
 Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
 togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
+metaai/Llama-2-7b-chat-hf,52.73037542662116,78.48038239394542,45.32519554457334
+metaai/Llama-2-13b-chat-hf,59.129692832764505,81.94582752439753,43.9572591900371

pegasus/benchmark.yaml CHANGED Viewed

@@ -3,7 +3,7 @@
 # {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
 # determines the specific node and gpu the generated job command will run on.
 - command:
-    - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json --model-path {{ model }} --task {{ task }}
   model:
     - /data/leaderboard/weights/metaai/llama-7B
     - /data/leaderboard/weights/metaai/llama-13B

 # {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
 # determines the specific node and gpu the generated job command will run on.
 - command:
+    - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json --model-path {{ model }} --task {{ task }}
   model:
     - /data/leaderboard/weights/metaai/llama-7B
     - /data/leaderboard/weights/metaai/llama-13B

requirements-benchmark.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 zeus-ml==0.4.0
-fschat==0.2.14
 rwkv==0.7.5
 einops
 tyro

 zeus-ml==0.4.0
+fschat==0.2.20
 rwkv==0.7.5
 einops
 tyro

scripts/benchmark.py CHANGED Viewed

@@ -197,7 +197,7 @@ def generate_stream(
             if not any(partially_stopped):
                 # indicates which request in batch stopped
                 different_indices = np.where(stopped != old_stopped)[0]
-                stop_length = np.array([(i, len(output[i])) for i in different_indices])
                 yield {
                     "text": output,
                     "stop_length": stop_length,
@@ -215,7 +215,7 @@ def generate_stream(
             spaces_between_special_tokens=False,
             clean_up_tokenization_spaces=True,
         )
-    stop_length = np.array([(i, len(output[i])) for i in false_indices])
     yield {
         "text": output,
@@ -230,7 +230,7 @@ def generate_stream(
 def main(
     model_path: str,
-    input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
     output_dir: str = "data",
     device_index: int = 0,
     task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
@@ -245,7 +245,7 @@ def main(
     Args:
         model_path: Path to or Huggingface Hub Id of the model.
         input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
-            (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json")
         output_dir: Path to the output directory. (Default: "data")
         device_index: Index of the GPU to use for inference. (Default: 0)
         task: Type of task to perform inference on. (Default: "chat")
@@ -304,7 +304,12 @@ def main(
     conv_base = get_conversation_template(model_path)
     # Standardize the system prompt for every model.
-    conv_base.system = SYSTEM_PROMPTS[task]
     conv_base.messages = []
     conv_base.offset = 0
@@ -407,7 +412,8 @@ def main(
         # Record numbers.
         output_text = output["text"]
         if not is_warmup:
-            response_length = int(sum(batch_token_len.values()))  # number of valid tokens
             latency = measurements.time
             throughput = response_length / latency
             energy = measurements.total_energy

             if not any(partially_stopped):
                 # indicates which request in batch stopped
                 different_indices = np.where(stopped != old_stopped)[0]
+                stop_length = np.array([(j, i+1) for j in different_indices])
                 yield {
                     "text": output,
                     "stop_length": stop_length,
             spaces_between_special_tokens=False,
             clean_up_tokenization_spaces=True,
         )
+    stop_length = np.array([(i, max_new_tokens) for i in false_indices])
     yield {
         "text": output,
 def main(
     model_path: str,
+    input_file: str = "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json",
     output_dir: str = "data",
     device_index: int = 0,
     task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
     Args:
         model_path: Path to or Huggingface Hub Id of the model.
         input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
+            (Default: "sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json")
         output_dir: Path to the output directory. (Default: "data")
         device_index: Index of the GPU to use for inference. (Default: 0)
         task: Type of task to perform inference on. (Default: "chat")
     conv_base = get_conversation_template(model_path)
     # Standardize the system prompt for every model.
+    if "llama-2" in model_path.lower():
+        conv_base.system = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPTS[task]}\n<</SYS>>\n\n"
+    elif "stablelm" in model_path.lower():
+        conv_base.system = f"""<|SYSTEM|># {SYSTEM_PROMPTS[task]}\n"""
+    else:
+        conv_base.system = SYSTEM_PROMPTS[task]
     conv_base.messages = []
     conv_base.offset = 0
         # Record numbers.
         output_text = output["text"]
         if not is_warmup:
+            total_length = int(sum(batch_token_len.values()))  # number of valid tokens
+            response_length = float(total_length) / len(convs)
             latency = measurements.time
             throughput = response_length / latency
             energy = measurements.total_energy

sharegpt/README.md CHANGED Viewed

@@ -25,3 +25,8 @@ python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file
 ```
 python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
 ```

 ```
 python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
 ```
+## Sorted data
+```
+python sort.py --data-dir sg_90k_part1_html_cleaned_lang_first_sampled.json --out-file sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
+```

sharegpt/{sg_90k_part1_html_cleaned_lang_first_sampled.json → sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json} RENAMED Viewed

The diff for this file is too large to render. See raw diff