Commit 7df19dd by VictorSanh
Parent(s): f10b974

Update visualization

Files changed:
- app_bis.py +7 -9
- app_dialogue.py +32 -71
app_bis.py
CHANGED
@@ -1,7 +1,6 @@
 import logging
 import os
 import re
-
 import time
 from io import BytesIO

@@ -10,7 +9,6 @@ import requests
 import torch
 import transformers
 from accelerate.utils import get_max_memory
-
 from joblib import Parallel, delayed
 from PIL import Image
 from transformers import AutoTokenizer
@@ -699,17 +697,17 @@ with gr.Blocks() as demo:
     converted into real newline characters.
     See examples and additional details below.""")

-    #gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
-    #gr.Markdown(MSG_MAIN)
-    #with gr.Row():
-    #with gr.Column():
+    # gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
+    # gr.Markdown(MSG_MAIN)
+    # with gr.Row():
+    # with gr.Column():
     gr.Markdown("## Input")
     with gr.Row():
         if not IS_MAIN_SPACE:
             images = gr.File(label="Images", file_count="multiple")
         prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here", lines=5)

-    #gr.Markdown("## Common parameters to all decoding strategy")
+    # gr.Markdown("## Common parameters to all decoding strategy")
     with gr.Row():
         with gr.Accordion("Common parameters to all decoding strategy", open=False, elem_id="common_params"):
             temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")
@@ -751,7 +749,7 @@
                 label="Stop generation when an image token, a bos or a eos token is generated", value=False
             )

-    #gr.Markdown("## Decoding strategy and its specific parameters")
+    # gr.Markdown("## Decoding strategy and its specific parameters")
     with gr.Accordion("Decoding strategy and its specific parameters", open=False, elem_id="decoding_params"):
         decoding_strategy = gr.Dropdown(
             ["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
@@ -793,7 +791,7 @@

     submit = gr.Button(label="Generate")

-    #with gr.Column():
+    # with gr.Column():
     with gr.Row():
         if IS_MAIN_SPACE:
             outputs = [
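app_bis.py only receives whitespace tweaks in this commit (two blank lines dropped from the imports, a space added after "#" in commented-out code). For orientation, here is a minimal sketch of the kind of gr.Blocks layout these hunks sit in; the wiring and the generate stub are illustrative placeholders, not the Space's actual code:

import gradio as gr

def generate(prompt, temperature, decoding_strategy):
    # Placeholder: the real app calls the model with these parameters.
    return f"[{decoding_strategy}, T={temperature}] {prompt}"

with gr.Blocks() as demo:
    gr.Markdown("## Input")
    prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here", lines=5)

    with gr.Accordion("Common parameters to all decoding strategy", open=False):
        temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")

    with gr.Accordion("Decoding strategy and its specific parameters", open=False):
        decoding_strategy = gr.Dropdown(
            ["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
            value="greedy",
            label="Decoding strategy",
        )

    submit = gr.Button("Generate")
    output = gr.Textbox(label="Output")
    submit.click(generate, inputs=[prompt, temperature, decoding_strategy], outputs=output)

demo.launch()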
app_dialogue.py
CHANGED
@@ -1,12 +1,11 @@
 import os

 import gradio as gr
-import requests


 models = [
-    "HuggingFaceM4/
-    # "HuggingFaceM4/
+    "HuggingFaceM4/idefics-9b-instruct",
+    # "HuggingFaceM4/idefics-80b-instruct",
 ]

 SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
@@ -31,18 +30,9 @@ BAN_TOKENS = "<image>;<fake_token_around_image>"
 EOS_TOKENS = "</s>;User"

 import logging
-import re
-from io import BytesIO

-import torch
 from accelerate.utils import get_max_memory
-from
-from transformers import AutoTokenizer
-
-from m4.models.vllama.configuration_vllama import VLlamaConfig
-from m4.models.vllama.modeling_vllama import VLlamaForCausalLM
-from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
-from m4.training.utils import build_image_transform
+from transformers import AutoTokenizer, AutoProcessor, AutoConfig, AutoModelForCausalLM


 TOKENIZER_FAST = True
@@ -52,7 +42,12 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger()


-def load_tokenizer_model(model_name):
+def load_processor_tokenizer_model(model_name):
+    processor = AutoProcessor.from_pretrained(
+        model_name,
+        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
+        truncation_side="left",
+    )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         use_fast=TOKENIZER_FAST,
@@ -61,7 +56,7 @@ def load_tokenizer_model(model_name):
     )
     # tokenizer.padding_side = "left" -> we don't need that, do we?

-    config =
+    config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
     max_memory_map = get_max_memory()

     for key in max_memory_map.keys():
@@ -71,7 +66,7 @@ def load_tokenizer_model(model_name):
         # Decrease 2 for Pytorch overhead and 2 for the forward to be safe
         max_memory_map[key] = f"{max_memory_map[key] - 4} GiB"

-    model =
+    model = AutoModelForCausalLM.from_pretrained(
         model_name,
         use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
         device_map="auto",
@@ -83,28 +78,23 @@ def load_tokenizer_model(model_name):
     print("Current device map:", model.hf_device_map)
     print("Model default generation config:", model.generation_config)
     # TODO: the device_map looks very inefficien right now. that could be improved
-    return tokenizer, model
+    return processor, tokenizer, model


-def fetch_images(url_images):
-
-
-
-
-    )
-
-    images = []
-    for url in url_images:
-        if isinstance(url, str):
-            images.append(Image.open(BytesIO(requests.get(url, stream=True, headers=headers).content)))
+def split_prompt_into_list(prompt_str):
+    """Convert a full string prompt to the list format expected by the processor."""
+    prompt_splitted = prompt_str.split("<fake_token_around_image>")
+    prompt_list = []
+    for ps in prompt_splitted:
+        if ps.startswith("<image:"):
+            prompt_list.append(ps[7:-1])
         else:
-
-    return
-
+            prompt_list.append(ps)
+    return prompt_list

 def model_generation(
     prompt,
-
+    processor,
     tokenizer,
     model,
     temperature,
@@ -123,31 +113,15 @@ def model_generation(
     top_p,
     penalty_alpha,
 ):
-
-
-
+    input_args = processor(
+        [split_prompt_into_list(prompt)],
+        eval_mode=True,
         truncation=True,
         max_length=MAX_SEQ_LEN - 512, # TODO: replace the 512 value with `max_new_tokens`
         padding=True,
-        add_special_tokens=False,
     )
-
-
-    attention_mask = torch.tensor([[1] + tokens.attention_mask[0]])
-
-    image_attention_mask = [
-        incremental_to_binary_attention_mask(
-            image_attention_mask_for_packed_input_ids(input_ids[0].unsqueeze(0), tokenizer)[0], num_classes=len(images)
-        )
-    ]
-
-    image_transform = build_image_transform(eval=True)
-    pixel_values = [torch.stack([image_transform(img) for img in images])]
-
-    input_ids = input_ids.to(0)
-    attention_mask = attention_mask.to(0)
-    pixel_values = torch.stack(pixel_values).to(0)
-    image_attention_mask = torch.cat(image_attention_mask, 0).to(0)
+    for k, v in input_args.items():
+        input_args[k] = v.to(0)

     # Excluding some words from the generation
     bad_words_ids = None
@@ -179,13 +153,6 @@ def model_generation(
         )
         eos_token_ids += tokenized_eos_token

-    # Inputs
-    input_args = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "pixel_values": pixel_values,
-        "image_attention_mask": image_attention_mask,
-    }
     # Common parameters to all decoding strategies
     # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
     generation_args = {
@@ -239,7 +206,7 @@ def model_generation(
         tokenizer.batch_decode(generated_tokens, skip_special_tokens=hide_special_tokens)[0]
     )

-    actual_generated_tokens = generated_tokens[:, input_ids.shape[-1] :]
+    actual_generated_tokens = generated_tokens[:, input_args["input_ids"].shape[-1] :]
     first_end_token = len(actual_generated_tokens[0])
     actual_generated_tokens = actual_generated_tokens[:, :first_end_token]
     generated_text = tokenizer.batch_decode(actual_generated_tokens, skip_special_tokens=hide_special_tokens)[0]
@@ -285,7 +252,7 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         show_label=False,
         container=False,
     )
-    tokenizer, model =
+    processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)

     imagebox = gr.Image(
         type="pil",
@@ -329,7 +296,7 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         elem_id="chatbot",
         label="Idefics Chatbot",
         visible=True,
-        height=
+        height=750,
         value=[
             [
                 (
@@ -391,7 +358,7 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         user_prompt,
         chat_history,
     ):
-        global model, tokenizer
+        global processor, model, tokenizer

         temperature = 1.0
         no_repeat_ngram_size = 0
@@ -412,15 +379,9 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
             history=chat_history,
         )

-        url_images = re.findall(r"<image(.*?)>", formated_prompt)
-        for idx, url_image in enumerate(url_images):
-            formated_prompt = formated_prompt.replace(url_image, "")
-            url_images[idx] = url_images[idx][1:]
-        images = fetch_images(url_images)
-
         generated_text = model_generation(
             prompt=formated_prompt,
-
+            processor=processor,
             tokenizer=tokenizer,
             model=model,
             temperature=temperature,
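The core of this commit is the switch from hand-rolled tokenization and image fetching to the model's processor: the prompt string is first split into text and image-URL segments by the new split_prompt_into_list helper, and the processor then builds the model inputs (input_ids, attention mask, pixel values, ...) itself. A minimal, self-contained check of the helper's behaviour; the prompt and URL below are made-up placeholders, not taken from the Space:

def split_prompt_into_list(prompt_str):
    """Convert a full string prompt to the list format expected by the processor."""
    prompt_splitted = prompt_str.split("<fake_token_around_image>")
    prompt_list = []
    for ps in prompt_splitted:
        if ps.startswith("<image:"):
            # "<image:" is 7 characters; drop it and the closing ">" to keep only the URL
            prompt_list.append(ps[7:-1])
        else:
            prompt_list.append(ps)
    return prompt_list

prompt = (
    "User:"
    "<fake_token_around_image><image:https://example.com/cat.png><fake_token_around_image>"
    "Describe this image.\nAssistant:"
)
print(split_prompt_into_list(prompt))
# ['User:', 'https://example.com/cat.png', 'Describe this image.\nAssistant:']

Per the diff, model_generation wraps that list into a batch of one and calls processor([split_prompt_into_list(prompt)], eval_mode=True, truncation=True, ...), moving every returned tensor to device 0 before generate; that is why the old input_ids/pixel_values plumbing and the fetch_images helper could be deleted.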