File size: 5,516 Bytes
353fa54
c3a1897
 
 
 
 
 
eb902b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d756d59
 
44a0c32
eb902b3
9b4b3ea
eb902b3
 
9b4b3ea
eb902b3
 
 
 
 
353fa54
c3a1897
 
 
 
 
353fa54
c3a1897
 
 
 
 
09db30b
 
 
 
8381241
 
44a0c32
 
 
 
 
c3a1897
 
44a0c32
c3a1897
 
44a0c32
 
c3a1897
 
44a0c32
 
 
 
 
 
 
 
 
 
c3a1897
 
 
44a0c32
 
 
 
 
 
 
 
 
 
c3a1897
 
eb902b3
c3a1897
 
 
8381241
44a0c32
c3a1897
 
 
 
 
80f89b9
8381241
80f89b9
 
c3a1897
 
40adb4f
eb902b3
 
 
8381241
eb902b3
 
c3a1897
 
8381241
c3a1897
 
8381241
 
 
 
51f8a02
c3a1897
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import argparse
import base64
import html
from io import BytesIO

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image

from models.image_text_transformation import ImageTextTransformation

# Command-line configuration for the demo. Note that the three caption /
# segmentation switches use store_true, so passing the flag sets them to True.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--gpt_version',
    choices=['gpt-3.5-turbo', 'gpt4'],
    default='gpt-3.5-turbo',
)
parser.add_argument(
    '--image_caption',
    action='store_true',
    dest='image_caption',
    default=True,
    help='Set this flag to True if you want to use BLIP2 Image Caption',
)
parser.add_argument(
    '--dense_caption',
    action='store_true',
    dest='dense_caption',
    default=True,
    help='Set this flag to True if you want to use Dense Caption',
)
parser.add_argument(
    '--semantic_segment',
    action='store_true',
    dest='semantic_segment',
    default=False,
    help='Set this flag to True if you want to use semantic segmentation',
)
parser.add_argument(
    '--image_caption_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended',
)
parser.add_argument(
    '--dense_caption_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, < 6G GPU is not recommended>',
)
parser.add_argument(
    '--semantic_segment_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended',
)
# NOTE(review): "contolnet" spelling is kept as-is; the attribute is presumably
# read under this name by ImageTextTransformation -- confirm before renaming.
parser.add_argument(
    '--contolnet_device',
    choices=['cuda', 'cpu'],
    default='cpu',
    help='Select the device: cuda or cpu, <6G GPU is not recommended>',
)

args = parser.parse_args()

# Use the GPU whenever torch reports one; otherwise everything runs on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The parsed *_device values are overwritten here: image captioning is always
# pinned to the CPU, while the remaining stages follow the detected device.
args.image_caption_device = "cpu"
args.dense_caption_device = device
args.semantic_segment_device = device
args.contolnet_device = device

def pil_image_to_base64(image):
    """Encode a PIL image as a base64 JPEG string.

    Args:
        image: PIL image to serialise. Images whose mode the JPEG writer
            cannot store (for example ``RGBA``, ``LA`` or ``P``) are converted
            to ``RGB`` first, which discards any alpha channel.

    Returns:
        The JPEG-encoded bytes of ``image`` as base64 text, without a
        ``data:`` URL prefix.
    """
    # Pillow raises OSError("cannot write mode ... as JPEG") for any mode
    # outside this set, so normalise the image before saving.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()

def add_logo():
    """Return the contents of ``examples/logo.png`` as base64 text.

    The path is relative, so it is resolved against the current working
    directory of the process.
    """
    with open("examples/logo.png", "rb") as logo_file:
        raw_bytes = logo_file.read()
    return base64.b64encode(raw_bytes).decode()

def process_image(image_src, options=None, processor=None):
    """Run image->text (and optionally text->image) and render the result as HTML.

    Args:
        image_src: Image input, forwarded unchanged to
            ``processor.image_to_text``.
        options: Collection of selected option labels. The label
            ``"Image Generation"`` enables the text->image step. ``None`` is
            treated as no options selected.
        processor: Object providing ``args``, ``image_to_text`` and
            ``text_to_image``. Required, despite the ``None`` default kept for
            signature compatibility.

    Returns:
        An HTML fragment (str) with the image caption, dense caption, region
        semantic and generated text, followed by the generated image as an
        inline base64 JPEG when image generation was requested.

    Raises:
        ValueError: If ``processor`` is not supplied.
    """
    print(options)
    if processor is None:
        raise ValueError("process_image requires a processor instance")
    if options is None:
        options = []
    # Semantic segmentation is forced off on every call, whatever the options.
    processor.args.semantic_segment = False
    image_generation_status = "Image Generation" in options
    image_caption, dense_caption, region_semantic, gen_text = processor.image_to_text(image_src)

    # These texts are model output and are rendered as raw HTML, so escape them
    # to show any markup literally. str() keeps non-string values (e.g. None)
    # rendering the same way the f-string would have rendered them.
    caption_html, dense_html, region_html, reasoning_html = (
        html.escape(str(text))
        for text in (image_caption, dense_caption, region_semantic, gen_text)
    )

    # Combine the outputs into a single HTML output
    custom_output = f'''
    <h2>Image->Text:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Image Caption</h3>
            <p>{caption_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Dense Caption</h3>
            <p>{dense_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>Region Semantic</h3>
            <p>{region_html}</p>
        </div>
        <div style="flex: 1;">
            <h3>GPT4 Reasoning:</h3>
            <p>{reasoning_html}</p>
        </div>
    </div>
    '''
    if image_generation_status:
        # The generator receives the original (unescaped) text.
        gen_image = processor.text_to_image(gen_text)
        gen_image_str = pil_image_to_base64(gen_image)
        custom_output += f'''
        <h2>Text->Image:</h2>
        <div style="display: flex; flex-wrap: wrap;">
            <div style="flex: 1;">
                <h3>Generated Image</h3>
                <img src="data:image/jpeg;base64,{gen_image_str}" width="400" style="vertical-align: middle;">
            </div>
        </div>
        '''
    return custom_output

# Build the pipeline once at import time; the interface callback below reuses
# this single instance for every call.
processor = ImageTextTransformation(args)

# Create Gradio input and output components
image_input = gr.inputs.Image(type='filepath', label="Input Image")
# NOTE: this checkbox is not passed to the interface; the "Options"
# CheckboxGroup below is what supplies the "Image Generation" choice.
image_generation_checkbox = gr.inputs.Checkbox(label="Image Generation", default=False)

logo_base64 = add_logo()
# Create the title with the logo. The logo is loaded from examples/logo.png,
# so the data URL declares image/png to match the embedded bytes.
title_with_logo = f'<img src="data:image/png;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

examples = [
    ["examples/test_4.jpg"],
]

# Create Gradio interface
interface = gr.Interface(
    # Bind the shared processor; Gradio supplies the image and selected options.
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[
        image_input,
        gr.CheckboxGroup(
            label="Options",
            choices=["Image Generation"],
        ),
    ],
    outputs=gr.outputs.HTML(),
    title=title_with_logo,
    examples=examples,
    description="""
    This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
    \n Github: https://github.com/showlab/Image2Paragraph
    \n Twitter: https://twitter.com/awinyimgprocess/status/1646225454599372800?s=46&t=HvOe9T2n35iFuCHP5aIHpQ
    \n Since GPU is expensive, we use CPU for demo and not include semantic segment anything. Run code local with gpu or google colab we provided for fast speed.
    \n Ttext2image model is controlnet ( very slow in cpu(~2m)), which used canny edge as reference.
    \n To speed up, we generate image with small size 256, run the code local for high-quality sample.
    """
)

# Launch the interface
interface.launch()