lopho committed on
Commit 6deb407
1 Parent(s): 3d3ef22

more info + correct model and dataset links

Files changed (2)
  1. README.md +3 -1
  2. app.py +23 -6
README.md CHANGED
@@ -11,9 +11,11 @@ license: agpl-3.0
  library_name: diffusers
  pipeline_tag: text-to-video
  datasets:
- - TempoFunk/tempofunk-s
+ - TempoFunk/tempofunk-sdance
+ - TempoFunk/tempofunk-m
  models:
  - TempoFunk/makeavid-sd-jax
+ - runwayml/stable-diffusion-v1-5
  tags:
  - jax-diffusers-event
  ---
app.py CHANGED
@@ -121,15 +121,31 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
  with gr.Column():
  intro1 = gr.Markdown("""
  # Make-A-Video Stable Diffusion JAX
+
+ We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
+ We take advantage of the extra 5 input channels of the inpaint model to guide the video generation with a hint image and mask.
+ The hint image can be given by the user; otherwise it is generated by a generative image model.
+
+ The temporal convolution and attention is a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch/blob/main/make_a_video_pytorch)
+ to FLAX. It is a pseudo-3D convolution that separately convolves across the spatial dimensions in 2D and over the temporal dimension in 1D.
+ Temporal attention is purely self-attention and also separately attends to time and space.
+
+ Only the new temporal layers have been fine-tuned on a dataset of videos themed around dance.
+ The model has been trained for 60 epochs on a dataset of 10,000 videos with 120 frames each, randomly selecting a 24-frame range from each sample.
+
+ See the model and dataset links in the metadata.
+
+ Model implementation and training code can be found at [https://github.com/lopho/makeavid-sd-tpu](https://github.com/lopho/makeavid-sd-tpu)
+ """)
+ with gr.Column():
+ intro3 = gr.Markdown("""
  **Please be patient. The model might have to compile with current parameters.**

  This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
  The compilation will be cached and consecutive runs with the same parameters
  will be much faster.
- """)
- with gr.Column():
- intro2 = gr.Markdown("""
- The following parameters require the model to compile
+
+ Changes to the following parameters require the model to compile:
  - Number of frames
  - Width & Height
  - Steps
@@ -153,7 +169,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
  )
  inference_steps_input = gr.Slider(
  label = 'Steps',
- minimum = 1,
+ minimum = 2,
  maximum = 100,
  value = 20,
  step = 1
@@ -222,6 +238,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
  height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
  width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
  num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+ image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
  inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
  will_trigger.value = trigger_check_fun(image_input.value, inference_steps_input.value, height_input.value, width_input.value, num_frames_input.value)
  ev = submit_button.click(
@@ -254,6 +271,6 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
  )
  cancel_button.click(fn = lambda: None, cancels = ev)

- demo.queue(concurrency_count = 1, max_size = 16)
+ demo.queue(concurrency_count = 1, max_size = 32)
  demo.launch()
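The intro text added above notes that the extra 5 input channels of the inpaint model carry a hint image and mask. For an SD-style inpainting UNet these are typically 4 latent channels for the VAE-encoded hint plus a 1-channel mask, concatenated with the 4 noisy latent channels. A minimal sketch of assembling such an input; the function name, shapes, and channels-last layout are illustrative assumptions, not taken from the repository:

```python
# Minimal sketch (not the repository's code): assembling the 9-channel
# input of an SD-style inpainting UNet from noisy video latents, the
# VAE-encoded hint image and a mask.
import jax.numpy as jnp

def build_unet_input(noisy_latents, hint_latents, mask):
    # noisy_latents: (batch, frames, h, w, 4) noised video latents
    # hint_latents:  (batch, frames, h, w, 4) hint image latents, repeated over frames
    # mask:          (batch, frames, h, w, 1) region to be generated
    # 4 + 4 + 1 = 9 input channels, i.e. the "extra 5 input channels".
    return jnp.concatenate([noisy_latents, hint_latents, mask], axis=-1)

x = build_unet_input(
    jnp.zeros((1, 24, 64, 64, 4)),
    jnp.zeros((1, 24, 64, 64, 4)),
    jnp.ones((1, 24, 64, 64, 1)),
)
print(x.shape)  # (1, 24, 64, 64, 9)
```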
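The added description also explains the temporal layers as pseudo-3D convolutions ported from Make-A-Video PyTorch to Flax: a 2D convolution over the spatial dimensions followed by a 1D convolution over time. A minimal Flax sketch of that factorization; module and parameter names are assumptions, not the repository's implementation:

```python
# Minimal Flax sketch of a factorized "pseudo-3D" convolution: a 2D
# convolution over the spatial dimensions followed by a 1D convolution
# over the temporal dimension. Names and shapes are illustrative.
import jax
import jax.numpy as jnp
import flax.linen as nn

class PseudoConv3d(nn.Module):
    features: int
    kernel_size: int = 3

    @nn.compact
    def __call__(self, x):
        # x: (batch, frames, height, width, channels)
        b, f, h, w, c = x.shape

        # Spatial 2D convolution, applied to each frame independently.
        x = x.reshape(b * f, h, w, c)
        x = nn.Conv(self.features, (self.kernel_size, self.kernel_size), padding='SAME')(x)
        x = x.reshape(b, f, h, w, self.features)

        # Temporal 1D convolution, applied to each pixel position independently.
        x = jnp.transpose(x, (0, 2, 3, 1, 4)).reshape(b * h * w, f, self.features)
        x = nn.Conv(self.features, (self.kernel_size,), padding='SAME')(x)
        x = jnp.transpose(x.reshape(b, h, w, f, self.features), (0, 3, 1, 2, 4))
        return x

# Example: 1 video, 24 frames of 64x64 latents with 4 channels.
x = jnp.zeros((1, 24, 64, 64, 4))
layer = PseudoConv3d(features=8)
params = layer.init(jax.random.PRNGKey(0), x)
y = layer.apply(params, x)
print(y.shape)  # (1, 24, 64, 64, 8)
```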
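Temporal attention is described as plain self-attention applied separately over space and over time. A small sketch of that factorized pattern, again with illustrative names only:

```python
# Minimal Flax sketch of factorized self-attention over a video latent:
# spatial self-attention within each frame, then temporal self-attention
# across frames at each pixel position. Illustrative, not the repo's module.
import jax
import jax.numpy as jnp
import flax.linen as nn

class SpatioTemporalSelfAttention(nn.Module):
    num_heads: int = 4

    @nn.compact
    def __call__(self, x):
        # x: (batch, frames, height, width, channels)
        b, f, h, w, c = x.shape

        # Spatial: each frame attends over its own h*w tokens.
        s = x.reshape(b * f, h * w, c)
        s = nn.SelfAttention(num_heads=self.num_heads)(s)
        x = s.reshape(b, f, h, w, c)

        # Temporal: each pixel position attends over the f frames.
        t = jnp.transpose(x, (0, 2, 3, 1, 4)).reshape(b * h * w, f, c)
        t = nn.SelfAttention(num_heads=self.num_heads)(t)
        return jnp.transpose(t.reshape(b, h, w, f, c), (0, 3, 1, 2, 4))

x = jnp.zeros((1, 24, 16, 16, 8))
attn = SpatioTemporalSelfAttention(num_heads=4)
params = attn.init(jax.random.PRNGKey(0), x)
y = attn.apply(params, x)
print(y.shape)  # (1, 24, 16, 16, 8)
```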
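Training is said to randomly select a 24-frame range from each 120-frame sample. One possible way to draw such a window in JAX; the actual data pipeline is not part of this diff:

```python
# Purely illustrative: drawing a random 24-frame window from a longer clip.
import jax
import jax.numpy as jnp

def sample_frame_range(rng, video, num_frames=24):
    # video: (total_frames, height, width, channels), e.g. 120 frames per sample
    total_frames = video.shape[0]
    start = jax.random.randint(rng, (), 0, total_frames - num_frames + 1)
    return jax.lax.dynamic_slice_in_dim(video, start, num_frames, axis=0)

clip = jnp.zeros((120, 64, 64, 3))
window = sample_frame_range(jax.random.PRNGKey(0), clip)
print(window.shape)  # (24, 64, 64, 3)
```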
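The app.py hunks wire image_input into the same trigger_check_fun as the other inputs, so changing the hint image now also updates the compilation indicator. trigger_check_fun itself is not shown in this commit; a hypothetical sketch of such a check and of the .change wiring, using only Gradio 3 APIs already present in app.py (the slider ranges other than Steps are placeholders):

```python
# Hypothetical reconstruction: trigger_check_fun is not part of this diff.
# Sketch of a check that reports whether the next run will recompile,
# plus the .change wiring that the commit extends to image_input.
import gradio as gr

last_compiled = {}  # parameters the model was last compiled for

def trigger_check_fun(image, steps, height, width, num_frames):
    current = {
        'hint': image is not None,  # presence of a hint image changes the traced graph
        'steps': steps,
        'height': height,
        'width': width,
        'frames': num_frames,
    }
    return 'Will trigger compilation.' if current != last_compiled else 'Will not trigger compilation.'

with gr.Blocks() as demo:
    image_input = gr.Image(label = 'Hint image')
    inference_steps_input = gr.Slider(label = 'Steps', minimum = 2, maximum = 100, value = 20, step = 1)
    height_input = gr.Slider(label = 'Height', minimum = 256, maximum = 576, value = 512, step = 64)
    width_input = gr.Slider(label = 'Width', minimum = 256, maximum = 576, value = 512, step = 64)
    num_frames_input = gr.Slider(label = 'Frames', minimum = 1, maximum = 24, value = 24, step = 1)
    will_trigger = gr.Markdown()
    trigger_inputs = [image_input, inference_steps_input, height_input, width_input, num_frames_input]
    # Every input that can force a recompile refreshes the indicator on change.
    for component in trigger_inputs:
        component.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
```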