Commit 01bc423 • Zhiyu Wu committed • 1 Parent(s): aaadf66

add attention mask; fix stop_str length (#26)
Files changed:
- pegasus/benchmark.yaml +7 -1
- scripts/benchmark.py +25 -6
- scripts/sort.py +15 -0
- sharegpt/README.md +1 -0
- sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json +0 -0
pegasus/benchmark.yaml (CHANGED)
```diff
@@ -3,7 +3,7 @@
 # {{ gpu }} is defined in `hosts.yaml`, and will be filled in when Pegasus
 # determines the specific node and gpu the generated job command will run on.
 command:
-  - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json --model-path {{ model }} --task {{ task }}
+  - docker exec leaderboard{{ gpu }} python scripts/benchmark.py --input-file sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json --model-path {{ model }} --task {{ task }} --batch-size {{ batch_size }}
 model:
   - /data/leaderboard/weights/metaai/llama-7B
   - /data/leaderboard/weights/metaai/llama-13B
@@ -31,3 +31,9 @@
   - chat-concise
   - instruct
   - instruct-concise
+batch_size:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
```
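For context, each top-level list in this file is a sweep axis, so adding `batch_size` multiplies the job matrix by five. A minimal sketch of that cross-product expansion, assuming Pegasus-style behavior (the expansion code and the placeholder `gpu=0` below are illustrative, not Pegasus internals):

```python
# Illustrative sketch only: how a sweep file like this could expand into one
# command per (model, task, batch_size) combination. This mimics, but is not,
# Pegasus's actual implementation.
from itertools import product

models = ["/data/leaderboard/weights/metaai/llama-7B",
          "/data/leaderboard/weights/metaai/llama-13B"]
tasks = ["chat-concise", "instruct", "instruct-concise"]
batch_sizes = [1, 2, 4, 8, 16]

template = (
    "docker exec leaderboard{gpu} python scripts/benchmark.py "
    "--model-path {model} --task {task} --batch-size {batch_size}"
)

for model, task, batch_size in product(models, tasks, batch_sizes):
    # {gpu} is filled from hosts.yaml in the real tool; 0 is a placeholder.
    print(template.format(gpu=0, model=model, task=task, batch_size=batch_size))
```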
scripts/benchmark.py (CHANGED)
```diff
@@ -104,7 +104,10 @@ def run_inference(
         temperature, repetition_penalty, top_p, top_k
     )
 
-
+    prompts_encode = tokenizer(prompts, padding=True)
+    input_ids = prompts_encode.input_ids
+    attention_masks = prompts_encode.attention_mask
+
     output_ids = [[] for _ in range(batch_size)]
 
     if model.config.is_encoder_decoder:
```
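This hunk switches to a padded batch encoding so unequal-length prompts can share one batch. A minimal sketch of what `tokenizer(prompts, padding=True)` returns, using GPT-2 purely as a stand-in tokenizer (the benchmark loads its own model and tokenizer):

```python
# Minimal sketch: batch tokenization with padding and an attention mask.
# "gpt2" is a stand-in; not the tokenizer the benchmark actually uses.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token

prompts = ["Hi", "A noticeably longer prompt"]
enc = tokenizer(prompts, padding=True)

# Shorter prompts are padded to the batch maximum; attention_mask is 1 on
# real tokens and 0 on padding, so the model can ignore padded positions.
print(enc.input_ids)
print(enc.attention_mask)
```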
```diff
@@ -113,10 +116,12 @@ def run_inference(
         max_src_len = context_len - max_new_tokens - 1
 
     input_ids = [input_id[-max_src_len:] for input_id in input_ids]
+    attention_masks = torch.as_tensor([attention_mask[-max_src_len:] for attention_mask in attention_masks], device=device)
 
     if model.config.is_encoder_decoder:
         encoder_output = model.encoder(
-            input_ids=torch.as_tensor(input_ids, device=device)
+            input_ids=torch.as_tensor(input_ids, device=device),
+            attention_mask=attention_masks
         )[0]
         start_ids = torch.as_tensor(
             [[model.generation_config.decoder_start_token_id] for _ in range(batch_size)],
```
```diff
@@ -126,6 +131,12 @@ def run_inference(
 
     past_key_values = out = None
     stopped = np.array(batch_size*[False])
+
+    # stop string length
+    stop_str_length = np.zeros(batch_size, dtype=int)
+    if stop_str and isinstance(stop_str, str):
+        stop_str_length[:] = len(tokenizer(stop_str).input_ids)
+
     for i in range(max_new_tokens):
         if i == 0:  # prefill
             if model.config.is_encoder_decoder:
```
```diff
@@ -136,7 +147,7 @@ def run_inference(
                 )
                 logits = model.lm_head(out[0])
             else:
-                out = model(torch.as_tensor(input_ids, device=device), use_cache=True)
+                out = model(torch.as_tensor(input_ids, device=device), use_cache=True, attention_mask=attention_masks)
                 logits = out.logits
                 past_key_values = out.past_key_values
         else:  # decoding
```
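The prefill forward now passes the mask so padded positions are excluded from attention. A runnable sketch of such a prefill call against a stand-in Hugging Face model (GPT-2 here; the benchmark uses the models listed in `benchmark.yaml`):

```python
# Sketch: prefill forward pass with an explicit attention mask (stand-in model).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")

enc = tokenizer(["Hi", "A longer prompt"], padding=True, return_tensors="pt")
with torch.no_grad():
    out = model(enc.input_ids, attention_mask=enc.attention_mask, use_cache=True)

print(out.logits.shape)  # (batch, padded_seq_len, vocab_size)
# out.past_key_values caches keys/values for the decoding steps that follow.
```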
```diff
@@ -157,10 +168,17 @@ def run_inference(
                 ),
                 use_cache=True,
                 past_key_values=past_key_values,
+                attention_mask=attention_masks,
             )
             logits = out.logits
             past_key_values = out.past_key_values
 
+        # update attention mask
+        attention_masks = torch.cat(
+            [attention_masks, torch.ones((batch_size, 1), device=device)],
+            dim=1
+        )
+
         if logits_processor:
             if repetition_penalty > 1.0:
                 tmp_output_ids = torch.as_tensor(output_ids, device=logits.device)
```
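Why the mask grows: every generation step appends exactly one real token to each sequence, so the mask gains a column of ones per step. A standalone illustration with made-up values:

```python
import torch

batch_size = 2
# Prefill mask for two sequences; the first one has a single padding position.
attention_masks = torch.tensor([[0, 1, 1],
                                [1, 1, 1]])

# One generation step later, every sequence has one more real token,
# so a column of ones is appended.
attention_masks = torch.cat(
    [attention_masks, torch.ones((batch_size, 1), dtype=attention_masks.dtype)],
    dim=1,
)
print(attention_masks)
# tensor([[0, 1, 1, 1],
#         [1, 1, 1, 1]])
```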
```diff
@@ -213,14 +231,15 @@ def run_inference(
             for each_stop in stop_str:
                 pos_array = np.char.rfind(output_np, each_stop, rfind_start)
                 find_stop = pos_array != -1
+                # update stop_str_length with each stop_str_length for each request
+                stop_str_length[find_stop] = len(tokenizer(each_stop).input_ids)
         else:
             raise ValueError("Invalid stop field type.")
 
         stop_str_indices = np.where(find_stop & ~stopped)[0]
         if stop_str_indices.size > 0:
             for j in stop_str_indices:
-
-                result[j].response_length = i
+                result[j].response_length = i+1-stop_str_length[j]
                 result[j].output = output[j][:pos_array[j]]
             stopped[find_stop] = True
 
```
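The `response_length` fix subtracts the stop string's own token count: after step `i`, `i+1` tokens have been generated, and the last `stop_str_length[j]` of them belong to the stop string rather than the response. A toy numeric check (values hypothetical):

```python
import numpy as np

i = 9                            # stop string detected at decoding step 9,
                                 # i.e. i + 1 = 10 tokens generated so far
stop_str_length = np.array([2])  # suppose the stop string tokenizes to 2 tokens
response_length = i + 1 - stop_str_length[0]
print(response_length)           # 8 tokens actually belong to the response
```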
```diff
@@ -378,7 +397,7 @@ def main(
 
     for is_warmup, input_prompts in data_iter:
         # Construct the input prompt.
-        for i in range(
+        for i in range(len(input_prompts)):
             conv = copy.deepcopy(conv_base)
             conv.append_message(conv.roles[0], input_prompts[i])
             conv.append_message(conv.roles[1], "")
```
scripts/sort.py (ADDED)
```diff
@@ -0,0 +1,15 @@
+import json
+import tyro
+
+def main(data_dir:str, out_file:str) -> None:
+
+    with open(data_dir, "r") as f:
+        data = json.load(f)
+
+    sorted_data = sorted(data, key=lambda x: len(x['conversations'][0]['value']), reverse=True)
+
+    with open(out_file, "w") as f:
+        json.dump(sorted_data, f, indent=4)
+
+if __name__ == "__main__":
+    tyro.cli(main)
```
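`tyro.cli(main)` generates the script's command-line interface from the function signature, so the `data_dir` and `out_file` parameters surface as the `--data-dir` and `--out-file` flags used in the README invocation below.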
sharegpt/README.md (CHANGED)
````diff
@@ -27,6 +27,7 @@ python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --
 ```
 
 ## Sorted data
+We sort the requests by sequence length, placing the longest sequences first. This approach minimizes the amount of padding required and allows for early detection of out-of-memory errors.
 ```
 python sort.py --data-dir sg_90k_part1_html_cleaned_lang_first_sampled.json --out-file sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json
 ```
````
sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled_sorted.json (CHANGED)

The diff for this file is too large to render.