yasserrmd committed on
Commit
ff95e3f
1 Parent(s): 36e07d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -13
app.py CHANGED
@@ -21,31 +21,37 @@ model = AutoModelForCausalLM.from_pretrained(
21
  )
22
 
23
  @spaces.GPU
24
- def describe_image(image):
25
- # Process the image
26
- inputs = processor.process(images=[image], text="Describe this image.")
27
 
28
  # Move inputs to the correct device and make a batch of size 1
29
  inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
30
 
31
- # Generate output with maximum 200 new tokens
32
  output = model.generate_from_batch(
33
  inputs,
34
  GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
35
  tokenizer=processor.tokenizer
36
  )
37
 
38
- # Decode and return generated text
39
  generated_tokens = output[0, inputs['input_ids'].size(1):]
40
  generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
41
 
42
  return generated_text
43
 
44
- # Gradio interface
45
- gr.Interface(
46
- fn=describe_image,
47
- inputs=gr.inputs.Image(type="pil"),
48
- outputs="text",
49
- title="Visual Language Model - Molmo",
50
- description="Upload an image, and the model will generate a detailed description of it."
51
- ).launch()
 
 
 
 
 
 
 
21
  )
22
 
@spaces.GPU
def describe_image(image, prompt):
    """Generate text about ``image`` in response to ``prompt``.

    Args:
        image: The image handed over by the Gradio image component
            (configured with ``type="pil"``). ``None`` when the user has
            not uploaded anything.
        prompt: Instruction text from the prompt textbox. An empty or
            whitespace-only prompt falls back to "Describe this image.".

    Returns:
        The decoded model output, excluding the prompt tokens and any
        special tokens.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable
            message instead of a processor traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image before generating a description.")

    # The textbox placeholder is only a hint, not a value: an untouched box
    # submits an empty string, so restore the default instruction explicitly.
    prompt = (prompt or "").strip() or "Describe this image."

    # Process the image with the (possibly defaulted) text prompt
    inputs = processor.process(images=[image], text=prompt)

    # Move inputs to the correct device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate output with a maximum of 200 new tokens
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # The output sequence starts with the prompt tokens; keep only what was
    # generated after them, then decode to text.
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text
43
 
# Gradio UI built with the Blocks API.
# NOTE(review): nesting was reconstructed from a whitespace-stripped diff;
# confirm which widgets belong inside the row against the real app.py.
with gr.Blocks() as demo:
    gr.Markdown("# Visual Language Model - Molmo")

    # Image upload and prompt entry share one row.
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        text_input = gr.Textbox(
            label="Enter a prompt",
            placeholder="Describe this image...",
        )

    output_text = gr.Textbox(label="Generated Description")
    submit_button = gr.Button("Generate Description")

    # Clicking the button sends (image, prompt) to describe_image and shows
    # the returned text in the output box.
    submit_button.click(
        fn=describe_image,
        inputs=[image_input, text_input],
        outputs=output_text,
    )

# Start serving the interface.
demo.launch()