ssboost committed
Commit 2255dc0 • Parent(s): 49f0887

Update app.py

Files changed (1):
  1. app.py +123 -156
app.py CHANGED
@@ -15,9 +15,6 @@ import os
  from PIL import Image
  import re
 
- # Initialize the Cohere model
- client = InferenceClient("CohereForAI/c4ai-command-r-plus", token=os.getenv("HF_TOKEN"))
-
  device = "cuda"
  ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
  ckpt_IPA_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-Plus")
@@ -28,16 +25,16 @@ vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(
  scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
  unet_t2i = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
  unet_i2i = unet_2d_condition.UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(f'{ckpt_IPA_dir}/image_encoder', ignore_mismatched_sizes=True).to(dtype=torch.float16, device=device)
  ip_img_size = 336
  clip_image_processor = CLIPImageProcessor(size=ip_img_size, crop_size=ip_img_size)
 
  pipe_t2i = pipeline_stable_diffusion_xl_chatglm_256.StableDiffusionXLPipeline(
      vae=vae,
-     text_encoder=text_encoder,
-     tokenizer=tokenizer,
-     unet=unet_t2i,
-     scheduler=scheduler,
      force_zeros_for_empty_prompt=False
  ).to(device)
 
@@ -60,34 +57,16 @@ pipe_i2i.load_ip_adapter(f'{ckpt_IPA_dir}' , subfolder="", weight_name=["ip_adap
  MAX_SEED = np.iinfo(np.int32).max
  MAX_IMAGE_SIZE = 1024
 
- def call_api(content, system_message, max_tokens=1000, temperature=0.7, top_p=0.95):
-     messages = [{"role": "system", "content": system_message}, {"role": "user", "content": content}]
-     response = client.chat_completion(messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
-     return response.choices[0].message['content']
-
- def generate_prompt(korean_prompt):
-     system_message = """
-     Given the following description in Korean,
-     translate and generate a concise English prompt suitable for a Stable Diffusion model.
-     The prompt should be focused, descriptive,
-     and contain specific keywords or phrases that will help guide the image generation process.
-     Use simple and descriptive language, avoiding unnecessary words.
-     Ensure the output is in English and follows the format typically used in Stable Diffusion prompts.
-     The description is: [Insert Korean description here]
-     """
-     optimized_prompt = call_api(korean_prompt, system_message)
-     return optimized_prompt  # return the optimized prompt
-
  @spaces.GPU
- def infer(prompt,
-           ip_adapter_image = None,
-           ip_adapter_scale = 0.5,
-           negative_prompt = "",
-           seed = 0,
-           randomize_seed = False,
-           width = 1024,
-           height = 1024,
-           guidance_scale = 5.0,
            num_inference_steps = 25
  ):
      if randomize_seed:
@@ -97,14 +76,14 @@ def infer(prompt,
      if ip_adapter_image is None:
          pipe_t2i.to(device)
          image = pipe_t2i(
-             prompt = prompt,
              negative_prompt = negative_prompt,
-             guidance_scale = guidance_scale,
-             num_inference_steps = num_inference_steps,
-             width = width,
              height = height,
              generator = generator
-         ).images[0]
          image.save("generated_image.jpg")  # change the file extension to .jpg
          return image, "generated_image.jpg"
      else:
@@ -115,10 +94,10 @@ def infer(prompt,
          image = pipe_i2i(
              prompt=prompt,
              ip_adapter_image=[ip_adapter_image],
-             negative_prompt=negative_prompt,
              height=height,
              width=width,
-             num_inference_steps=num_inference_steps,
              guidance_scale=guidance_scale,
              num_images_per_prompt=1,
              generator=generator
@@ -126,10 +105,26 @@ def infer(prompt,
      image.save("generated_image.jpg")  # change the file extension to .jpg
      return image, "generated_image.jpg"
 
- # Reference code integrated to add the image-description feature
  from transformers import AutoProcessor, AutoModelForCausalLM
 
- model = AutoModelForCausalLM.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', trust_remote_code=True).to("cuda").eval()
  processor = AutoProcessor.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', trust_remote_code=True)
 
  def modify_caption(caption: str) -> str:
@@ -145,7 +140,7 @@ def modify_caption(caption: str) -> str:
      return modified_caption if modified_caption != caption else caption
 
  @spaces.GPU
- def describe_image(image):
      image = Image.fromarray(image)
      task_prompt = "<DESCRIPTION>"
      prompt = task_prompt + "Describe this image in great detail."
@@ -153,7 +148,7 @@ def describe_image(image):
      if image.mode != "RGB":
          image = image.convert("RGB")
 
-     inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
      generated_ids = model.generate(
          input_ids=inputs["input_ids"],
          pixel_values=inputs["pixel_values"],
@@ -164,124 +159,96 @@ def describe_image(image):
      parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
      return modify_caption(parsed_answer["<DESCRIPTION>"])
 
- css="""
- #col-left {
-     margin: 0 auto;
-     max-width: 600px;
- }
- #col-right {
-     margin: 0 auto;
-     max-width: 750px;
- }
- """
-
  with gr.Blocks(css=css) as Kolors:
-     with gr.Row():
-         with gr.Column(elem_id="col-left"):
-             with gr.Row():
-                 korean_prompt = gr.Textbox(
-                     label="Korean prompt input",
-                     placeholder="Enter your desired prompt in Korean",
-                     lines=2
-                 )
-             with gr.Row():
-                 generate_prompt_button = gr.Button("Generate Prompt")
-             with gr.Row():
-                 optimized_prompt = gr.Textbox(
-                     label="Generated optimized prompt",
-                     placeholder=" ",
-                     lines=2,
-                     interactive=False
-                 )
-             with gr.Row():
-                 generated_prompt = gr.Textbox(
-                     label="Prompt input",
-                     placeholder="Enter the prompt to use for image generation",
-                     lines=2
-                 )
-             with gr.Row():
-                 ip_adapter_image = gr.Image(label="Image Prompt (optional)", type="pil")
-             with gr.Row(visible=False):  # Advanced settings hidden
-                 negative_prompt = gr.Textbox(
-                     label="Negative prompt",
-                     placeholder="Enter a negative prompt",
-                     visible=True,
-                 )
-                 seed = gr.Slider(
-                     label="Seed",
-                     minimum=0,
-                     maximum=MAX_SEED,
-                     step=1,
-                     value=0,
-                 )
-                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                  with gr.Row():
-                     width = gr.Slider(
-                         label="Width",
-                         minimum=256,
-                         maximum=MAX_IMAGE_SIZE,
-                         step=32,
-                         value=1024,
-                     )
-                     height = gr.Slider(
-                         label="Height",
-                         minimum=256,
-                         maximum=MAX_IMAGE_SIZE,
-                         step=32,
-                         value=1024,
                      )
                  with gr.Row():
-                     guidance_scale = gr.Slider(
-                         label="Guidance scale",
-                         minimum=0.0,
-                         maximum=10.0,
-                         step=0.1,
-                         value=5.0,
                      )
-                     num_inference_steps = gr.Slider(
-                         label="Number of inference steps",
-                         minimum=10,
-                         maximum=50,
                          step=1,
-                         value=25,
                      )
                  with gr.Row():
-                     ip_adapter_scale = gr.Slider(
-                         label="Image influence scale",
-                         info="Use 1 for creating variations",
-                         minimum=0.0,
-                         maximum=1.0,
-                         step=0.05,
-                         value=0.5,
-                     )
-             with gr.Row():
-                 run_button = gr.Button("Generate Image")
-
-         with gr.Column(elem_id="col-right"):
-             result = gr.Image(label="Result", show_label=False)
-             download_button = gr.File(label="Download Image")
-             image_description = gr.Textbox(label="Image Description", placeholder="The image analysis result will appear here.", interactive=False)
-             analyze_button = gr.Button("Analyze Image")
-
-     # Generate the optimized prompt and display the result
-     generate_prompt_button.click(
-         fn=generate_prompt,
-         inputs=[korean_prompt],
-         outputs=[optimized_prompt]
-     )
-
-     # Generate the image and set the download file path
-     run_button.click(
-         fn=infer,
-         inputs=[generated_prompt, ip_adapter_image, ip_adapter_scale, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
-         outputs=[result, download_button]
-     )
 
-     # Generate the image description
-     analyze_button.click(
-         fn=describe_image,
-         inputs=[ip_adapter_image],
-         outputs=[image_description]
-     )
 
  Kolors.queue().launch(debug=True)
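For reference, the Korean-to-English prompt helper removed by this commit can be reproduced as a standalone script. A minimal sketch, assuming HF_TOKEN is set and the hosted CohereForAI/c4ai-command-r-plus chat endpoint is still served; the example input at the bottom is hypothetical:

# Standalone sketch of the prompt-translation helper removed in this commit.
# Assumes HF_TOKEN is set and the hosted chat endpoint is still available.
import os
from huggingface_hub import InferenceClient

client = InferenceClient("CohereForAI/c4ai-command-r-plus", token=os.getenv("HF_TOKEN"))

def generate_prompt(korean_prompt: str) -> str:
    system_message = (
        "Given the following description in Korean, translate and generate a concise "
        "English prompt suitable for a Stable Diffusion model."
    )
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": korean_prompt},
    ]
    response = client.chat_completion(messages, max_tokens=1000, temperature=0.7, top_p=0.95)
    # Attribute access; the original code used dict-style indexing on the message.
    return response.choices[0].message.content

print(generate_prompt("눈 덮인 산 위의 일출"))  # hypothetical input: "sunrise over a snow-capped mountain"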
 
  from PIL import Image
  import re
 
  device = "cuda"
  ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
  ckpt_IPA_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-Plus")
 
  scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
  unet_t2i = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
  unet_i2i = unet_2d_condition.UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(f'{ckpt_IPA_dir}/image_encoder',ignore_mismatched_sizes=True).to(dtype=torch.float16, device=device)
  ip_img_size = 336
  clip_image_processor = CLIPImageProcessor(size=ip_img_size, crop_size=ip_img_size)
 
  pipe_t2i = pipeline_stable_diffusion_xl_chatglm_256.StableDiffusionXLPipeline(
      vae=vae,
+     text_encoder=text_encoder,
+     tokenizer=tokenizer,
+     unet=unet_t2i,
+     scheduler=scheduler,
      force_zeros_for_empty_prompt=False
  ).to(device)
 
  MAX_SEED = np.iinfo(np.int32).max
  MAX_IMAGE_SIZE = 1024
 
  @spaces.GPU
+ def infer(prompt,
+           ip_adapter_image = None,
+           ip_adapter_scale = 0.5,
+           negative_prompt = "",
+           seed = 0,
+           randomize_seed = False,
+           width = 1024,
+           height = 1024,
+           guidance_scale = 5.0,
            num_inference_steps = 25
  ):
      if randomize_seed:
 
      if ip_adapter_image is None:
          pipe_t2i.to(device)
          image = pipe_t2i(
+             prompt = prompt,
              negative_prompt = negative_prompt,
+             guidance_scale = guidance_scale,
+             num_inference_steps = num_inference_steps,
+             width = width,
              height = height,
              generator = generator
+         ).images[0]
          image.save("generated_image.jpg")  # change the file extension to .jpg
          return image, "generated_image.jpg"
      else:
 
          image = pipe_i2i(
              prompt=prompt,
              ip_adapter_image=[ip_adapter_image],
+             negative_prompt=negative_prompt,
              height=height,
              width=width,
+             num_inference_steps=num_inference_steps,
              guidance_scale=guidance_scale,
              num_images_per_prompt=1,
              generator=generator
 
          image.save("generated_image.jpg")  # change the file extension to .jpg
          return image, "generated_image.jpg"
 
+ css="""
+ #col-left {
+     margin: 0 auto;
+     max-width: 600px;
+ }
+ #col-right {
+     margin: 0 auto;
+     max-width: 750px;
+ }
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ # Additional code integrated
  from transformers import AutoProcessor, AutoModelForCausalLM
 
+ model = AutoModelForCausalLM.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', trust_remote_code=True).eval()
  processor = AutoProcessor.from_pretrained('gokaygokay/Florence-2-SD3-Captioner', trust_remote_code=True)
 
  def modify_caption(caption: str) -> str:
 
      return modified_caption if modified_caption != caption else caption
 
  @spaces.GPU
+ def run_example(image):
      image = Image.fromarray(image)
      task_prompt = "<DESCRIPTION>"
      prompt = task_prompt + "Describe this image in great detail."
 
      if image.mode != "RGB":
          image = image.convert("RGB")
 
+     inputs = processor(text=prompt, images=image, return_tensors="pt")
      generated_ids = model.generate(
          input_ids=inputs["input_ids"],
          pixel_values=inputs["pixel_values"],
 
      parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
      return modify_caption(parsed_answer["<DESCRIPTION>"])
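One behavioral note on the hunk above: the new file drops the explicit .to("cuda") on both the Florence-2 model and the processor inputs, so captioning appears to run on the CPU even inside the @spaces.GPU handler. A minimal sketch of keeping generation on the GPU, assuming a CUDA device is available; the generate arguments elided in this view are filled with assumed values:

import torch

@spaces.GPU
def run_example_cuda(image):
    # Same flow as run_example above, but weights and inputs are moved to the GPU.
    image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")
    task_prompt = "<DESCRIPTION>"
    prompt = task_prompt + "Describe this image in great detail."
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    model.to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,  # assumed value; the commit's setting is elided in this view
        )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text, task=task_prompt, image_size=(image.width, image.height)
    )
    return modify_caption(parsed_answer["<DESCRIPTION>"])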
  with gr.Blocks(css=css) as Kolors:
+     with gr.Tab("Image Generation"):
+         with gr.Row():
+             with gr.Column(elem_id="col-left"):
                  with gr.Row():
+                     generated_prompt = gr.Textbox(
+                         label="Prompt input",
+                         placeholder="Enter the prompt to use for image generation",
+                         lines=2
                      )
                  with gr.Row():
+                     ip_adapter_image = gr.Image(label="Image Prompt (optional)", type="pil")
+                 with gr.Row(visible=False):  # Advanced settings hidden
+                     negative_prompt = gr.Textbox(
+                         label="Negative prompt",
+                         placeholder="Enter a negative prompt",
+                         visible=True,
                      )
+                     seed = gr.Slider(
+                         label="Seed",
+                         minimum=0,
+                         maximum=MAX_SEED,
                          step=1,
+                         value=0,
                      )
+                     randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+                     with gr.Row():
+                         width = gr.Slider(
+                             label="Width",
+                             minimum=256,
+                             maximum=MAX_IMAGE_SIZE,
+                             step=32,
+                             value=1024,
+                         )
+                         height = gr.Slider(
+                             label="Height",
+                             minimum=256,
+                             maximum=MAX_IMAGE_SIZE,
+                             step=32,
+                             value=1024,
+                         )
+                     with gr.Row():
+                         guidance_scale = gr.Slider(
+                             label="Guidance scale",
+                             minimum=0.0,
+                             maximum=10.0,
+                             step=0.1,
+                             value=5.0,
+                         )
+                         num_inference_steps = gr.Slider(
+                             label="Number of inference steps",
+                             minimum=10,
+                             maximum=50,
+                             step=1,
+                             value=25,
+                         )
+                     with gr.Row():
+                         ip_adapter_scale = gr.Slider(
+                             label="Image influence scale",
+                             info="Use 1 for creating variations",
+                             minimum=0.0,
+                             maximum=1.0,
+                             step=0.05,
+                             value=0.5,
+                         )
                  with gr.Row():
+                     run_button = gr.Button("Generate Image")
+
+             with gr.Column(elem_id="col-right"):
+                 result = gr.Image(label="Result", show_label=False)
+                 download_button = gr.File(label="Download Image")
+
+         # Generate the image and set the download file path
+         run_button.click(
+             fn=infer,
+             inputs=[generated_prompt, ip_adapter_image, ip_adapter_scale, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
+             outputs=[result, download_button]
+         )
 
+     with gr.Tab("Florence-2 SD3 Prompts"):
+         gr.Markdown("# [Florence-2 SD3 Long Captioner](https://huggingface.co/gokaygokay/Florence-2-SD3-Captioner/)")
+         gr.Markdown("[Florence-2 Base](https://huggingface.co/microsoft/Florence-2-base-ft) fine-tuned on Long SD3 Prompt and Image pairs. Check the link above for the datasets used for fine-tuning.")
+         with gr.Row():
+             with gr.Column():
+                 input_img = gr.Image(label="Input Picture")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+
+         submit_btn.click(run_example, [input_img], [output_text])
 
  Kolors.queue().launch(debug=True)
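Finally, since run_example receives the raw gr.Image value (a NumPy array, which it converts with Image.fromarray), the new tab's handler can be smoke-tested without the UI. A minimal sketch; the file name is a hypothetical placeholder:

import numpy as np
from PIL import Image

# "photo.jpg" is a placeholder path; gr.Image passes the picture as a NumPy array.
img = np.array(Image.open("photo.jpg").convert("RGB"))
print(run_example(img))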