zetavg committed
Commit 00263ef
1 Parent(s): 1a203ff

fix inference output

llama_lora/lib/inference.py CHANGED
@@ -4,7 +4,6 @@ import transformers
 from .get_device import get_device
 from .streaming_generation_utils import Iteratorize, Stream
 
-
 def generate(
     # model
     model,
@@ -30,18 +29,34 @@ def generate(
         "stopping_criteria": transformers.StoppingCriteriaList() + stopping_criteria
     }
 
+    skip_special_tokens = True
+
+    if '/dolly' in tokenizer.name_or_path:
+        # dolly has additional_special_tokens as ['### End', '### Instruction:', '### Response:'], skipping them will break the prompter's reply extraction.
+        skip_special_tokens = False
+        # Ensure generation stops once it generates "### End"
+        end_key_token_id = tokenizer.encode("### End")
+        end_key_token_id = end_key_token_id[0]  # 50277
+        if isinstance(generate_params['generation_config'].eos_token_id, str):
+            generate_params['generation_config'].eos_token_id = [generate_params['generation_config'].eos_token_id]
+        elif not generate_params['generation_config'].eos_token_id:
+            generate_params['generation_config'].eos_token_id = []
+        generate_params['generation_config'].eos_token_id.append(end_key_token_id)
+
     if stream_output:
         # Stream the reply 1 token at a time.
         # This is based on the trick of using 'stopping_criteria' to create an iterator,
         # from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
+        generation_output = None
 
         def generate_with_callback(callback=None, **kwargs):
+            nonlocal generation_output
            kwargs["stopping_criteria"].insert(
                 0,
                 Stream(callback_func=callback)
             )
             with torch.no_grad():
-                model.generate(**kwargs)
+                generation_output = model.generate(**kwargs)
 
         def generate_with_streaming(**kwargs):
             return Iteratorize(
@@ -50,16 +65,22 @@ def generate(
 
         with generate_with_streaming(**generate_params) as generator:
             for output in generator:
-                decoded_output = tokenizer.decode(output, skip_special_tokens=True)
+                decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
                 yield decoded_output, output
                 if output[-1] in [tokenizer.eos_token_id]:
                     break
+
+            if generation_output:
+                output = generation_output.sequences[0]
+                decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
+                yield decoded_output, output
+
         return # early return for stream_output
 
     # Without streaming
     with torch.no_grad():
         generation_output = model.generate(**generate_params)
     output = generation_output.sequences[0]
-    decoded_output = tokenizer.decode(output, skip_special_tokens=True)
+    decoded_output = tokenizer.decode(output, skip_special_tokens=skip_special_tokens)
     yield decoded_output, output
     return
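In the hunk above, the dolly-specific branch keeps `skip_special_tokens=False` so the prompt markers survive decoding, and it appends the id of the "### End" marker to `generation_config.eos_token_id` so generation stops at that marker; the new `nonlocal generation_output` additionally lets the streaming branch yield the final, complete sequence once the iterator is exhausted. Below is a minimal, self-contained sketch of just the stop-token handling; the checkpoint name and `GenerationConfig` values are placeholders and are not taken from this repository, which instead mutates `generate_params['generation_config']` in place.

```python
from transformers import AutoTokenizer, GenerationConfig

# Placeholder checkpoint: any Dolly-style tokenizer that registers "### End"
# as an added token behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b")
generation_config = GenerationConfig(max_new_tokens=128)

# "### End" encodes to a single added-token id (50277 per the diff's comment).
end_key_token_id = tokenizer.encode("### End")[0]

# Normalize eos_token_id (None, a single id, or already a list) into a list,
# then append the extra stop token so generation also halts at "### End".
eos = generation_config.eos_token_id
if eos is None:
    eos = []
elif not isinstance(eos, list):
    eos = [eos]
if end_key_token_id not in eos:
    eos.append(end_key_token_id)
generation_config.eos_token_id = eos

# model.generate(..., generation_config=generation_config) will now stop on
# either the model's own EOS token or the "### End" marker.
```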
llama_lora/ui/inference_ui.py CHANGED
@@ -160,84 +160,6 @@ def do_inference(
                     None)
 
         return
-
-
-        if stream_output:
-            # Stream the reply 1 token at a time.
-            # This is based on the trick of using 'stopping_criteria' to create an iterator,
-            # from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
-
-            def generate_with_callback(callback=None, **kwargs):
-                kwargs.setdefault(
-                    "stopping_criteria", transformers.StoppingCriteriaList()
-                )
-                kwargs["stopping_criteria"].append(
-                    Stream(callback_func=callback)
-                )
-                with torch.no_grad():
-                    model.generate(**kwargs)
-
-            def generate_with_streaming(**kwargs):
-                return Iteratorize(
-                    generate_with_callback, kwargs, callback=None
-                )
-
-            with generate_with_streaming(**generate_params) as generator:
-                for output in generator:
-                    # new_tokens = len(output) - len(input_ids[0])
-                    decoded_output = tokenizer.decode(output)
-
-                    if output[-1] in [tokenizer.eos_token_id]:
-                        break
-
-                    raw_output = None
-                    if show_raw:
-                        raw_output = str(output)
-                    response = prompter.get_response(decoded_output)
-
-                    if Global.should_stop_generating:
-                        return
-
-                    yield (
-                        gr.Textbox.update(
-                            value=response, lines=inference_output_lines),
-                        raw_output)
-
-                    if Global.should_stop_generating:
-                        # If the user stops the generation, and then clicks the
-                        # generation button again, they may mysteriously landed
-                        # here, in the previous, should-be-stopped generation
-                        # function call, with the new generation function not be
-                        # called at all. To workaround this, we yield a message
-                        # and setting lines=1, and if the front-end JS detects
-                        # that lines has been set to 1 (rows="1" in HTML),
-                        # it will automatically click the generate button again
-                        # (gr.Textbox.update() does not support updating
-                        # elem_classes or elem_id).
-                        # [WORKAROUND-UI01]
-                        yield (
-                            gr.Textbox.update(
-                                value="Please retry", lines=1),
-                            None)
-            return  # early return for stream_output
-
-        # Without streaming
-        with torch.no_grad():
-            generation_output = model.generate(**generate_params)
-        s = generation_output.sequences[0]
-        output = tokenizer.decode(s)
-        raw_output = None
-        if show_raw:
-            raw_output = str(s)
-
-        response = prompter.get_response(output)
-        if Global.should_stop_generating:
-            return
-
-        yield (
-            gr.Textbox.update(value=response, lines=inference_output_lines),
-            raw_output)
-
     except Exception as e:
         raise gr.Error(e)
 
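This hunk only deletes the streaming path that now lives in `llama_lora/lib/inference.py`, leaving `do_inference` to consume the shared generator, which yields `(decoded_output, output)` pairs. A rough sketch of that consumer pattern is shown below; the function and parameter names are illustrative rather than the repository's actual API, and only `prompter.get_response`, `gr.Textbox.update`, and the shape of the yielded pair are taken from the diffs above.

```python
import gradio as gr


def stream_to_textbox(generate, prompter, generation_args,
                      show_raw=False, inference_output_lines=12):
    """Sketch of a UI-side consumer of the shared generate() generator."""
    for decoded_output, output in generate(**generation_args):
        # The generator already yields decoded text, so the UI only needs to
        # extract the reply portion and update the output textbox.
        raw_output = str(output) if show_raw else None
        response = prompter.get_response(decoded_output)
        yield (
            gr.Textbox.update(value=response, lines=inference_output_lines),
            raw_output,
        )
```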
 
llama_lora/utils/prompter.py CHANGED
@@ -131,8 +131,13 @@ class Prompter(object):
     def get_response(self, output: str) -> str:
         if self.template_name == "None":
             return output
+
+        splitted_output = output.split(self.template["response_split"])
+        # if len(splitted_output) <= 1:
+        #     return output.strip()
+
         return self.template["response_split"].join(
-            output.split(self.template["response_split"])[1:]
+            splitted_output[1:]
         ).strip()
 
     def get_variable_names(self) -> List[str]:
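The `get_response` change splits the decoded output on the template's `response_split` marker and rejoins everything after the first occurrence. The small illustration below (the template dict and strings are made up for the example; only the split/join logic mirrors the code above) shows why the marker has to survive decoding, which is also why the inference change keeps `skip_special_tokens=False` for dolly: if the tokenizer strips the marker as a special token, `split()` returns a single element and the extracted reply collapses to an empty string.

```python
template = {"response_split": "### Response:"}


def get_response(output: str) -> str:
    # Mirrors Prompter.get_response: keep everything after the first marker.
    splitted_output = output.split(template["response_split"])
    return template["response_split"].join(splitted_output[1:]).strip()


# Marker present: the reply is extracted as expected.
decoded = "### Instruction:\nSay hi\n### Response:\nHello!"
print(get_response(decoded))   # -> "Hello!"

# Marker stripped during decoding (e.g. skip_special_tokens=True with a
# tokenizer that treats "### Response:" as a special token): nothing follows
# a marker that no longer exists, so the extracted reply is empty.
stripped = "\nSay hi\n\nHello!"
print(get_response(stripped))  # -> ""
```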