Fix: Ensure the object is correctly placed in the scene, without texturing, when no texture image is provided

#4
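Previously, `run_texture_scene` gated its two branches on the wrong inputs: the scene operator only ran when a texture image was supplied, and the texturing operator was triggered by the scene image, so it could be called with a `None` texture. This PR swaps the checks so each branch tests the input it actually consumes: the scene prior runs whenever a scene image is provided, texturing runs only when a texture image is provided, and an object with a scene but no texture is now placed in the scene untextured instead of failing.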
Files changed (1)
  1. pops.py +230 -231
pops.py CHANGED
@@ -1,231 +1,230 @@
-import gradio as gr
-import torch
-from PIL import Image
-from diffusers import PriorTransformer, UNet2DConditionModel, KandinskyV22Pipeline
-from huggingface_hub import hf_hub_download
-from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer, CLIPTextModelWithProjection
-
-from model import pops_utils
-from model.pipeline_pops import pOpsPipeline
-
-kandinsky_prior_repo: str = 'kandinsky-community/kandinsky-2-2-prior'
-kandinsky_decoder_repo: str = 'kandinsky-community/kandinsky-2-2-decoder'
-prior_texture_repo: str = 'models/texturing/learned_prior.pth'
-prior_instruct_repo: str = 'models/instruct/learned_prior.pth'
-prior_scene_repo: str = 'models/scene/learned_prior.pth'
-prior_repo = "pOpsPaper/operators"
-
-# gpu = torch.device('cuda')
-# cpu = torch.device('cpu')
-
-class PopsPipelines:
-    def __init__(self):
-        weight_dtype = torch.float16
-        self.weight_dtype = weight_dtype
-        device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.device = 'cuda' #device
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
-                                                                           subfolder='image_encoder',
-                                                                           torch_dtype=weight_dtype).eval()
-        self.image_encoder.requires_grad_(False)
-
-        self.image_processor = CLIPImageProcessor.from_pretrained(kandinsky_prior_repo,
-                                                                  subfolder='image_processor')
-
-        self.tokenizer = CLIPTokenizer.from_pretrained(kandinsky_prior_repo, subfolder='tokenizer')
-        self.text_encoder = CLIPTextModelWithProjection.from_pretrained(kandinsky_prior_repo,
-                                                                        subfolder='text_encoder',
-                                                                        torch_dtype=weight_dtype).eval().to(device)
-
-        # Load full model for vis
-        self.unet = UNet2DConditionModel.from_pretrained(kandinsky_decoder_repo,
-                                                         subfolder='unet').to(torch.float16).to(device)
-
-
-        self.decoder = KandinskyV22Pipeline.from_pretrained(kandinsky_decoder_repo, unet=self.unet,
-                                                            torch_dtype=torch.float16)
-        self.decoder = self.decoder.to(device)
-
-
-        self.priors_dict = {
-            'texturing': {'repo': prior_texture_repo},
-            'instruct': {'repo': prior_instruct_repo},
-            'scene': {'repo': prior_scene_repo}
-        }
-
-        for prior_type in self.priors_dict:
-            prior_path = self.priors_dict[prior_type]['repo']
-            prior = PriorTransformer.from_pretrained(
-                kandinsky_prior_repo, subfolder="prior"
-            )
-
-            # Load from huggingface
-            prior_path = hf_hub_download(repo_id=prior_repo, filename=str(prior_path))
-            prior_state_dict = torch.load(prior_path, map_location=device)
-            prior.load_state_dict(prior_state_dict, strict=False)
-
-            prior.eval()
-            prior = prior.to(weight_dtype)
-
-            prior_pipeline = pOpsPipeline.from_pretrained(kandinsky_prior_repo,
-                                                          prior=prior,
-                                                          image_encoder=self.image_encoder,
-                                                          torch_dtype=torch.float16)
-
-            self.priors_dict[prior_type]['pipeline'] = prior_pipeline
-
-    def process_image(self, input_path):
-        if input_path is None:
-            return None
-        image_pil = Image.open(input_path).convert("RGB").resize((512, 512))
-        image = torch.Tensor(self.image_processor(image_pil)['pixel_values'][0]).to(self.device).unsqueeze(0).to(
-            self.weight_dtype)
-
-        return image
-
-    def process_text(self, text):
-        self.text_encoder.to('cuda')
-        text_inputs = self.tokenizer(
-            text,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        mask = text_inputs.attention_mask.bool()  # [0]
-
-        text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
-        text_encoder_hidden_states = text_encoder_output.last_hidden_state
-        text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
-        self.text_encoder.to('cpu')
-        return text_encoder_concat
-
-    def run_binary(self, input_a, input_b, prior_type):
-        # Move pipeline to GPU
-        pipeline = self.priors_dict[prior_type]['pipeline']
-        pipeline.to('cuda')
-        self.image_encoder.to('cuda')
-        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
-                                                                       self.image_encoder,
-                                                                       pipeline.prior.clip_mean.detach(),
-                                                                       pipeline.prior.clip_std.detach())
-
-        negative_input_embeds = torch.zeros_like(input_image_embeds)
-        negative_hidden_states = torch.zeros_like(input_hidden_state)
-
-        guidance_scale = 1.0
-        if prior_type == 'texturing':
-            guidance_scale = 8.0
-
-        img_emb = pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
-                           negative_input_embeds=negative_input_embeds,
-                           negative_input_hidden_states=negative_hidden_states,
-                           num_inference_steps=25,
-                           num_images_per_prompt=1,
-                           guidance_scale=guidance_scale)
-
-        # Optional
-        if prior_type == 'scene':
-            # Scene is the closest to what avg represents for a background image, so incorporate that as well
-            mean_emb = 0.5 * input_hidden_state[:, 0] + 0.5 * input_hidden_state[:, 1]
-            mean_emb = (mean_emb * pipeline.prior.clip_std) + pipeline.prior.clip_mean
-            alpha = 0.4
-            img_emb.image_embeds = (1 - alpha) * img_emb.image_embeds + alpha * mean_emb
-
-        # Move pipeline to CPU
-        pipeline.to('cpu')
-        self.image_encoder.to('cpu')
-        return img_emb
-
-    def run_instruct(self, input_a, text):
-
-        text_encodings = self.process_text(text)
-
-        # Move pipeline to GPU
-        instruct_pipeline = self.priors_dict['instruct']['pipeline']
-        instruct_pipeline.to('cuda')
-        self.image_encoder.to('cuda')
-        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
-                                                                       self.image_encoder,
-                                                                       instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
-                                                                       concat_hidden_states=text_encodings)
-
-        negative_input_embeds = torch.zeros_like(input_image_embeds)
-        negative_hidden_states = torch.zeros_like(input_hidden_state)
-        img_emb = instruct_pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
-                                    negative_input_embeds=negative_input_embeds,
-                                    negative_input_hidden_states=negative_hidden_states,
-                                    num_inference_steps=25,
-                                    num_images_per_prompt=1,
-                                    guidance_scale=1.0)
-
-        # Move pipeline to CPU
-        instruct_pipeline.to('cpu')
-        self.image_encoder.to('cpu')
-        return img_emb
-
-    def render(self, img_emb):
-        self.decoder.to('cuda')
-        images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
-                              num_inference_steps=50, height=512,
-                              width=512, guidance_scale=4).images
-        self.decoder.to('cpu')
-        return images[0]
-
-    def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
-        # Process both inputs
-        image_object = self.process_image(image_object_path)
-        image_texture = self.process_image(image_texture_path)
-
-        if image_object is None:
-            raise gr.Error('Object image is required')
-
-        current_emb = None
-
-        if image_texture is None:
-            instruct_input = image_object
-        else:
-            # Run texturing
-            current_emb = self.run_binary(input_a=image_object, input_b=image_texture, prior_type='texturing')
-            instruct_input = current_emb.image_embeds
-
-        if text_instruct != '':
-            current_emb = self.run_instruct(input_a=instruct_input, text=text_instruct)
-
-        if current_emb is None:
-            raise gr.Error('At least one of the inputs is required')
-
-        # Render as image
-        image = self.render(current_emb)
-
-        return image
-
-    def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
-        # Process both inputs
-        image_object = self.process_image(image_object_path)
-        image_texture = self.process_image(image_texture_path)
-        image_scene = self.process_image(image_scene_path)
-
-        if image_object is None:
-            raise gr.Error('Object image is required')
-
-        current_emb = None
-
-        if image_texture is None:
-            scene_input = image_object
-        else:
-            # Run texturing
-            current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
-            scene_input = current_emb.image_embeds
-
-        # Run scene
-        if image_scene is not None:
-            current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')
-
-        if current_emb is None:
-            raise gr.Error('At least one of the images is required')
-        # Render as image
-        image = self.render(current_emb)
-
-        return image
-
+import gradio as gr
+import torch
+from PIL import Image
+from diffusers import PriorTransformer, UNet2DConditionModel, KandinskyV22Pipeline
+from huggingface_hub import hf_hub_download
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, CLIPTokenizer, CLIPTextModelWithProjection
+
+from model import pops_utils
+from model.pipeline_pops import pOpsPipeline
+
+kandinsky_prior_repo: str = 'kandinsky-community/kandinsky-2-2-prior'
+kandinsky_decoder_repo: str = 'kandinsky-community/kandinsky-2-2-decoder'
+prior_texture_repo: str = 'models/texturing/learned_prior.pth'
+prior_instruct_repo: str = 'models/instruct/learned_prior.pth'
+prior_scene_repo: str = 'models/scene/learned_prior.pth'
+prior_repo = "pOpsPaper/operators"
+
+# gpu = torch.device('cuda')
+# cpu = torch.device('cpu')
+
+class PopsPipelines:
+    def __init__(self):
+        weight_dtype = torch.float16
+        self.weight_dtype = weight_dtype
+        device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = 'cuda' #device
+        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
+                                                                           subfolder='image_encoder',
+                                                                           torch_dtype=weight_dtype).eval()
+        self.image_encoder.requires_grad_(False)
+
+        self.image_processor = CLIPImageProcessor.from_pretrained(kandinsky_prior_repo,
+                                                                  subfolder='image_processor')
+
+        self.tokenizer = CLIPTokenizer.from_pretrained(kandinsky_prior_repo, subfolder='tokenizer')
+        self.text_encoder = CLIPTextModelWithProjection.from_pretrained(kandinsky_prior_repo,
+                                                                        subfolder='text_encoder',
+                                                                        torch_dtype=weight_dtype).eval().to(device)
+
+        # Load full model for vis
+        self.unet = UNet2DConditionModel.from_pretrained(kandinsky_decoder_repo,
+                                                         subfolder='unet').to(torch.float16).to(device)
+
+
+        self.decoder = KandinskyV22Pipeline.from_pretrained(kandinsky_decoder_repo, unet=self.unet,
+                                                            torch_dtype=torch.float16)
+        self.decoder = self.decoder.to(device)
+
+
+        self.priors_dict = {
+            'texturing': {'repo': prior_texture_repo},
+            'instruct': {'repo': prior_instruct_repo},
+            'scene': {'repo': prior_scene_repo}
+        }
+
+        for prior_type in self.priors_dict:
+            prior_path = self.priors_dict[prior_type]['repo']
+            prior = PriorTransformer.from_pretrained(
+                kandinsky_prior_repo, subfolder="prior"
+            )
+
+            # Load from huggingface
+            prior_path = hf_hub_download(repo_id=prior_repo, filename=str(prior_path))
+            prior_state_dict = torch.load(prior_path, map_location=device)
+            prior.load_state_dict(prior_state_dict, strict=False)
+
+            prior.eval()
+            prior = prior.to(weight_dtype)
+
+            prior_pipeline = pOpsPipeline.from_pretrained(kandinsky_prior_repo,
+                                                          prior=prior,
+                                                          image_encoder=self.image_encoder,
+                                                          torch_dtype=torch.float16)
+
+            self.priors_dict[prior_type]['pipeline'] = prior_pipeline
+
+    def process_image(self, input_path):
+        if input_path is None:
+            return None
+        image_pil = Image.open(input_path).convert("RGB").resize((512, 512))
+        image = torch.Tensor(self.image_processor(image_pil)['pixel_values'][0]).to(self.device).unsqueeze(0).to(
+            self.weight_dtype)
+
+        return image
+
+    def process_text(self, text):
+        self.text_encoder.to('cuda')
+        text_inputs = self.tokenizer(
+            text,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        mask = text_inputs.attention_mask.bool()  # [0]
+
+        text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
+        text_encoder_hidden_states = text_encoder_output.last_hidden_state
+        text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
+        self.text_encoder.to('cpu')
+        return text_encoder_concat
+
+    def run_binary(self, input_a, input_b, prior_type):
+        # Move pipeline to GPU
+        pipeline = self.priors_dict[prior_type]['pipeline']
+        pipeline.to('cuda')
+        self.image_encoder.to('cuda')
+        input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
+                                                                       self.image_encoder,
+                                                                       pipeline.prior.clip_mean.detach(),
+                                                                       pipeline.prior.clip_std.detach())
+
+        negative_input_embeds = torch.zeros_like(input_image_embeds)
+        negative_hidden_states = torch.zeros_like(input_hidden_state)
+
+        guidance_scale = 1.0
+        if prior_type == 'texturing':
+            guidance_scale = 8.0
+
+        img_emb = pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
+                           negative_input_embeds=negative_input_embeds,
+                           negative_input_hidden_states=negative_hidden_states,
+                           num_inference_steps=25,
+                           num_images_per_prompt=1,
+                           guidance_scale=guidance_scale)
+
+        # Optional
+        if prior_type == 'scene':
+ # Scene is the closet to what avg represents for a background image so incorporate that as well
130
+ mean_emb = 0.5 * input_hidden_state[:, 0] + 0.5 * input_hidden_state[:, 1]
131
+ mean_emb = (mean_emb * pipeline.prior.clip_std) + pipeline.prior.clip_mean
132
+ alpha = 0.4
133
+ img_emb.image_embeds = (1 - alpha) * img_emb.image_embeds + alpha * mean_emb
134
+
135
+ # Move pipeline to CPU
136
+ pipeline.to('cpu')
137
+ self.image_encoder.to('cpu')
138
+ return img_emb
139
+
140
+ def run_instruct(self, input_a, text):
141
+
142
+ text_encodings = self.process_text(text)
143
+
144
+ # Move pipeline to GPU
145
+ instruct_pipeline = self.priors_dict['instruct']['pipeline']
146
+ instruct_pipeline.to('cuda')
147
+ self.image_encoder.to('cuda')
148
+ input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
149
+ self.image_encoder,
150
+ instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
151
+ concat_hidden_states=text_encodings)
152
+
153
+ negative_input_embeds = torch.zeros_like(input_image_embeds)
154
+ negative_hidden_states = torch.zeros_like(input_hidden_state)
155
+ img_emb = instruct_pipeline(input_embeds=input_image_embeds, input_hidden_states=input_hidden_state,
156
+ negative_input_embeds=negative_input_embeds,
157
+ negative_input_hidden_states=negative_hidden_states,
158
+ num_inference_steps=25,
159
+ num_images_per_prompt=1,
160
+ guidance_scale=1.0)
161
+
162
+ # Move pipeline to CPU
163
+ instruct_pipeline.to('cpu')
164
+ self.image_encoder.to('cpu')
165
+ return img_emb
166
+
167
+ def render(self, img_emb):
168
+ self.decoder.to('cuda')
169
+ images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
170
+ num_inference_steps=50, height=512,
171
+ width=512, guidance_scale=4).images
172
+ self.decoder.to('cpu')
173
+ return images[0]
174
+
175
+ def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
176
+ # Process both inputs
177
+ image_object = self.process_image(image_object_path)
178
+ image_texture = self.process_image(image_texture_path)
179
+
180
+ if image_object is None:
181
+ raise gr.Error('Object image is required')
182
+
183
+ current_emb = None
184
+
185
+ if image_texture is None:
186
+ instruct_input = image_object
187
+ else:
188
+ # Run texturing
189
+ current_emb = self.run_binary(input_a=image_object, input_b=image_texture,prior_type='texturing')
190
+ instruct_input = current_emb.image_embeds
191
+
192
+ if text_instruct != '':
193
+ current_emb = self.run_instruct(input_a=instruct_input, text=text_instruct)
194
+
195
+ if current_emb is None:
196
+ raise gr.Error('At least one of the inputs is required')
197
+
198
+ # Render as image
199
+ image = self.render(current_emb)
200
+
201
+ return image
202
+
203
+ def run_texture_scene(self, image_object_path, image_texture_path, image_scene_path):
204
+ image_object = self.process_image(image_object_path)
205
+ image_texture = self.process_image(image_texture_path)
206
+ image_scene = self.process_image(image_scene_path)
207
+
208
+ if image_object is None:
209
+ raise gr.Error('Object image is required')
210
+
211
+ current_emb = None
212
+
213
+ # If both object and scene images are provided, run scene processing
214
+ if image_scene is not None:
215
+ current_emb = self.run_binary(input_a=image_object, input_b=image_scene, prior_type='scene')
216
+ scene_input = current_emb.image_embeds
217
+ else:
218
+ scene_input = image_object
219
+
220
+ # If a texture image is provided, apply texturing
221
+ if image_texture is not None:
222
+ current_emb = self.run_binary(input_a=scene_input, input_b=image_texture, prior_type='texturing')
223
+
224
+ if current_emb is None:
225
+ raise gr.Error('At least one of the images is required')
226
+
227
+ # Render the final image
228
+ image = self.render(current_emb)
229
+
230
+ return image
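
For reviewers, a minimal sketch of how the fixed path is expected to behave, assuming `pops.py` is importable and a CUDA device is available (the pipelines are moved to `'cuda'`); the example paths are hypothetical placeholders, not files in this repo:

```python
# Hypothetical smoke test for this fix; paths are placeholders.
from pops import PopsPipelines

pipelines = PopsPipelines()

# Object + scene with the texture omitted: the scene prior now runs and the
# texturing branch is skipped, so the untextured object is placed in the scene.
image = pipelines.run_texture_scene(image_object_path='examples/object.png',
                                    image_texture_path=None,
                                    image_scene_path='examples/scene.png')
image.save('object_in_scene.png')
```

Before this change, the same call would reach the texturing operator with `input_b=None` (or skip scene placement entirely), since the branch conditions tested the wrong inputs.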