Ketengan-Diffusion-Lab committed
Commit fd950ef
1 Parent(s): 82daaef

Update app.py

Files changed (1): app.py (+56 -38)
app.py CHANGED
@@ -1,45 +1,63 @@
 import gradio as gr
 import torch
-from transformers import AutoModel, AutoTokenizer
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
+import warnings
+
+# disable some warnings
+transformers.logging.set_verbosity_error()
+transformers.logging.disable_progress_bar()
+warnings.filterwarnings('ignore')
+
+# set device
+torch.set_default_device('cuda') # or 'cpu'
+
+model_name = 'cognitivecomputations/dolphin-vision-7b'
 
-# Disable gradient computation
-torch.set_grad_enabled(False)
-
-# Initialize model and tokenizer
-model = AutoModel.from_pretrained('internlm/internlm-xcomposer2d5-7b',
-                                  torch_dtype=torch.bfloat16,
-                                  trust_remote_code=True).cuda().eval()
-tokenizer = AutoTokenizer.from_pretrained('internlm/internlm-xcomposer2d5-7b',
-                                          trust_remote_code=True)
-model.tokenizer = tokenizer
-
-# Define the function to process input and generate a response
-def analyze_image(query, image_path):
-    image = Image.open(image_path)
-    # Convert image to required format and save temporarily if needed
-    with torch.autocast(device_type='cuda', dtype=torch.float16):
-        response, _ = model.chat(tokenizer, query, [image_path], do_sample=False, num_beams=3, use_meta=True)
-
-    return response
-
-# Create Gradio interface
+# create model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map='auto',
+    trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    trust_remote_code=True)
+
+def inference(prompt, image):
+    messages = [
+        {"role": "user", "content": f'<image>\n{prompt}'}
+    ]
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+
+    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
+
+    # generate
+    output_ids = model.generate(
+        input_ids,
+        images=image_tensor,
+        max_new_tokens=2048,
+        use_cache=True)[0]
+
+    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
 with gr.Blocks() as demo:
-    gr.Markdown("## Image Analysis Tool using Hugging Face's `internlm-xcomposer2d5-7b`")
-
-    with gr.Row():
-        query_input = gr.Textbox(label="Enter your query", placeholder="Analyze the given image in a detailed manner")
-
     with gr.Row():
-        image_input = gr.Image(label="Upload an Image", type="filepath")
-
-    with gr.Row():
-        result_output = gr.Textbox(label="Result", placeholder="Model response will appear here", interactive=False)
-
-    with gr.Row():
-        submit_button = gr.Button("Submit")
-
-    submit_button.click(fn=analyze_image, inputs=[query_input, image_input], outputs=result_output)
+        with gr.Column():
+            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
+            image_input = gr.Image(label="Image", type="pil")
+            submit_button = gr.Button("Submit")
+        with gr.Column():
+            output_text = gr.Textbox(label="Output")
+
+    submit_button.click(fn=inference, inputs=[prompt_input, image_input], outputs=output_text)
 
-# Launch the Gradio interface
-demo.launch()
+demo.launch()
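
The least obvious step in the new inference() is the -200 splice: the chat-formatted prompt is split on the '<image>' placeholder, each half is tokenized separately, and the halves are rejoined around the sentinel id -200, which dolphin-vision's trust_remote_code implementation swaps for the projected image features during generate() (a LLaVA-style convention; -200 is not a real vocabulary id, so it cannot be produced by tokenizing text). A minimal sketch of just that step, assuming the tokenizer loaded above; the prompt string is illustrative only:

# Sketch: how '<image>' becomes a -200 image-token sentinel.
# -200 is the image slot expected by the model's remote code; it is
# spliced in after tokenization because it is not a vocabulary id.
text = 'user: <image>\nDescribe this image in detail'  # illustrative
chunks = [tokenizer(c).input_ids for c in text.split('<image>')]
input_ids = torch.tensor(chunks[0] + [-200] + chunks[1],
                         dtype=torch.long).unsqueeze(0)  # shape (1, seq_len)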
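
Because the handler now receives a PIL image directly (type="pil" on gr.Image) instead of a file path, it can be smoke-tested without launching the UI. A minimal sketch, assuming the app.py above has already run (model and tokenizer loaded); the filename 'sample.jpg' is hypothetical:

# Quick smoke test of inference() outside Gradio.
# Assumes model/tokenizer from app.py are loaded;
# 'sample.jpg' is a hypothetical local image file.
from PIL import Image

img = Image.open('sample.jpg').convert('RGB')
print(inference('Describe this image in detail', img))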