Jae-Won Chung committed on
Commit
c97bae1
1 Parent(s): abd945c

Updated diffusion benchmark and data

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitignore +1 -1
  2. benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml +4 -4
  3. benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml +4 -4
  4. benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py +1 -1
  5. benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py +26 -15
  6. benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py +41 -35
  7. benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py +1 -1
  8. benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml +1 -1
  9. benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py +2 -1
  10. benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py +49 -8
  11. benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py +20 -18
  12. benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml +1 -1
  13. benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml +1 -1
  14. benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py +2 -1
  15. benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py +11 -11
  16. benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py +37 -35
  17. data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
  18. data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
  19. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
  20. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
  21. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
  22. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
  23. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
  24. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
  25. data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
  26. data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
  27. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
  28. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
  29. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
  30. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
  31. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
  32. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
  33. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json +2 -2
  34. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json +2 -2
  35. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json +2 -2
  36. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json +8 -0
  37. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json +2 -2
  38. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json +2 -2
  39. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json +2 -2
  40. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json +2 -2
  41. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json +2 -2
  42. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json +2 -2
  43. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json +2 -2
  44. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json +8 -0
  45. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json +2 -2
  46. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json +8 -0
  47. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json +2 -2
  48. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json +2 -2
  49. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json +2 -2
  50. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json +2 -2
.gitignore CHANGED
@@ -18,4 +18,4 @@ build/
 
 # Data files
 *.log
-pegasus/consumed.yaml
+figures/
benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 8 4 2 1 --power-limits 400 --num-inference-steps 25"
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720'
+  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576'
+  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576'
benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_700.json --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 25"
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+  - "--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720"
+  - "--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576"
+  - "--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576"
benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py CHANGED
@@ -27,10 +27,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_infernece_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -80,6 +80,7 @@ def load_text_image_prompts(
     path: str,
     batch_size: int,
     num_batches: int | None = None,
+    image_resize: tuple[int, int] | None = None,
 ) -> tuple[int, list[tuple[list[str], list[Image.Image]]]]:
     """Load the dataset to feed the model and return it as a list of batches of prompts.
 
@@ -93,6 +94,9 @@ def load_text_image_prompts(
     dataset = json.load(open(path))
     assert len(dataset["caption"]) == len(dataset["video_id"])
 
+    dataset["caption"] *= 10
+    dataset["video_id"] *= 10
+
     if num_batches is not None:
         if len(dataset["caption"]) < num_batches * batch_size:
             raise ValueError("Not enough data for the requested number of batches.")
@@ -103,6 +107,8 @@ def load_text_image_prompts(
     dataset["first_frame"] = [
         load_image(str(image_path / f"{video_id}.jpg")) for video_id in dataset["video_id"]
     ]
+    if image_resize is not None:
+        dataset["first_frame"] = [image.resize(image_resize) for image in dataset["first_frame"]]
 
     batched = [
         (dataset["caption"][i : i + batch_size], dataset["first_frame"][i : i + batch_size])
@@ -135,8 +141,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -150,11 +156,16 @@ def benchmark(args: argparse.Namespace) -> None:
     pynvml.nvmlInit()
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
     gpu_model = pynvml.nvmlDeviceGetName(handle)
-    pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
-    pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
+    # pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
+    # pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
     pynvml.nvmlShutdown()
 
-    num_prompts, batched_prompts = load_text_image_prompts(args.dataset_path, args.batch_size, args.num_batches)
+    num_prompts, batched_prompts = load_text_image_prompts(
+        args.dataset_path,
+        args.batch_size,
+        args.num_batches,
+        (args.width, args.height),
+    )
 
     pipeline = get_pipeline(args.model)
 
@@ -189,7 +200,7 @@ def benchmark(args: argparse.Namespace) -> None:
     fps_param_name = fps_param_name_candidates[0]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -210,15 +221,15 @@ def benchmark(args: argparse.Namespace) -> None:
         if args.add_text_prompt:
             params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -255,10 +266,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_infernece_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
@@ -289,8 +300,8 @@ if __name__ == "__main__":
     parser.add_argument("--num-inference-steps", type=int, default=50, help="The number of denoising steps.")
     parser.add_argument("--num-frames", type=int, default=1, help="The number of frames to generate.")
    parser.add_argument("--fps", type=int, default=16, help="Frames per second for micro-conditioning.")
-    parser.add_argument("--height", type=int, help="Height of the generated video.")
-    parser.add_argument("--width", type=int, help="Width of the generated video.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--num-batches", type=int, default=None, help="The number of batches to use from the dataset.")
     parser.add_argument("--save-every", type=int, default=10, help="Save generations to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py CHANGED
@@ -28,44 +28,48 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
-                    "docker", "run",
-                    "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
-                    "--cap-add", "SYS_ADMIN",
-                    "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
-                    "--rm",
-                    "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
-                    "-v", f"{os.getcwd()}:/workspace/image-to-video",
-                    "mlenergy/leaderboard:diffusion-i2v",
-                    "--dataset-path", args.dataset_path,
-                    "--result-root", args.result_root,
-                    "--batch-size", batch_size,
-                    "--num-batches", "10",
-                    "--power-limit", power_limit,
-                    "--model", args.model,
-                    "--huggingface-token", hf_token,
-                    "--num-frames", args.num_frames,
-                    "--num-inference-steps", args.num_inference_steps,
-                ] + (["--add-text-prompt"] if args.add_text_prompt else []),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/image-to-video",
+                        "mlenergy/leaderboard:diffusion-i2v",
+                        "--dataset-path", args.dataset_path,
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "8",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--huggingface-token", hf_token,
+                        "--num-frames", args.num_frames,
+                        "--num-inference-steps", num_inference_steps,
+                        "--width", str(args.width),
+                        "--height", str(args.height),
+                    ] + (["--add-text-prompt"] if args.add_text_prompt else []),
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -77,8 +81,10 @@ if __name__ == "__main__":
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
     parser.add_argument("--num-frames", type=str, help="Number of frames to generate")
-    parser.add_argument("--num-inference-steps", type=str, help="Number of denoising steps")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "30", "40", "50"], help="Number of inference steps to run")
     parser.add_argument("--add-text-prompt", action="store_true", help="Input text prompt alongside image.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py CHANGED
@@ -3,7 +3,7 @@ import json
 
 import cv2
 
-DATASET_PATH = "sharegpt4video_700.json"
+DATASET_PATH = "sharegpt4video_100.json"
 
 
 def main() -> None:
benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --num-inference-steps 1 2 4 8 16 25 30 40 50 --power-limits 400"
   model:
   - stabilityai/stable-diffusion-2-1
   - stabilityai/stable-diffusion-xl-base-1.0
benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py CHANGED
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import time
 import json
 import argparse
+import multiprocessing as mp
 from pprint import pprint
 from pathlib import Path
 from contextlib import suppress
@@ -11,6 +13,7 @@ from dataclasses import dataclass, field, asdict
 import torch
 import pynvml
 import numpy as np
+import pandas as pd
 from PIL import Image
 from datasets import load_dataset, Dataset
 from transformers.trainer_utils import set_seed
@@ -35,9 +38,9 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
     num_prompts: int
     average_clip_score: float = 0.0
     total_runtime: float = 0.0
@@ -118,6 +121,28 @@ def load_partiprompts(
     return len(batched) * batch_size, batched
 
 
+def power_monitor(csv_path: str, gpu_indices: list[int], chan: mp.SimpleQueue) -> None:
+    pynvml.nvmlInit()
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices]
+
+    fields = [
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU),
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY),
+    ]
+
+    columns = ["timestamp"] + sum([[f"gpu{i}", f"vram{i}"] for i in gpu_indices], [])
+    power: list[list] = []
+    while chan.empty():
+        row = [time.monotonic()]
+        values = [pynvml.nvmlDeviceGetFieldValues(h, fields) for h in handles]
+        for value in values:
+            row.extend((value[0].value.uiVal, value[1].value.uiVal))
+        power.append(row)
+        time.sleep(max(0.0, 0.1 - (time.monotonic() - row[0])))
+
+    pd.DataFrame(power, columns=columns).to_csv(csv_path, index=False)
+
+
 def calculate_clip_score(
     model: CLIPModel,
     processor: CLIPProcessor,
@@ -183,8 +208,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     image_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -222,27 +247,42 @@ def benchmark(args: argparse.Namespace) -> None:
         ResultIntermediateBatched(prompts=batch) for batch in batched_prompts
     ]
 
+    pmon = None
+    pmon_chan = None
+    if args.monitor_power:
+        pmon_chan = mp.SimpleQueue()
+        pmon = mp.get_context("spawn").Process(
+            target=power_monitor,
+            args=(f"{benchmark_name}+power.csv", [g.gpu_index for g in zeus_monitor.gpus.gpus], pmon_chan),
+        )
+        pmon.start()
+
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     for ind, intermediate in enumerate(intermediates):
         print(f"Batch {ind + 1}/{len(intermediates)}")
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         images = pipeline(
             intermediate.prompts,
             generator=rng,
             num_inference_steps=args.num_inference_steps,
            output_type="np",
         ).images
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.images = images
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
+    if pmon is not None and pmon_chan is not None:
+        pmon_chan.put("stop")
+        pmon.join(timeout=5.0)
+        pmon.terminate()
+
     # Scale images to [0, 256] and convert to uint8
     for intermediate in intermediates:
         intermediate.images = (intermediate.images * 255).astype("uint8")
@@ -292,9 +332,9 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
         num_prompts=num_prompts,
         average_clip_score=sum(r.clip_score for r in results) / len(results),
         total_runtime=measurements.time,
@@ -326,6 +366,7 @@ if __name__ == "__main__":
     parser.add_argument("--image-save-every", type=int, default=10, help="Save images to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
     parser.add_argument("--huggingface-token", type=str, help="The HuggingFace token to use.")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
 
     benchmark(args)
benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py CHANGED
@@ -28,12 +28,13 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                cmd=[
                     "docker", "run",
                     "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
                     "--cap-add", "SYS_ADMIN",
@@ -48,22 +49,21 @@ def main(args: argparse.Namespace) -> None:
                     "--power-limit", power_limit,
                     "--model", args.model,
                     "--huggingface-token", hf_token,
-                    "--num-inference-steps", "25",
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+                    "--num-inference-steps", num_inference_steps,
+                ]
+                if args.monitor_power:
+                    cmd.append("--monitor-power")
+                with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -74,5 +74,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of inference steps to run")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
   - ali-vilab/text-to-video-ms-1.7b
   - guoyww/animatediff-motion-adapter-v1-5-3
benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_700.json --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
   - ali-vilab/text-to-video-ms-1.7b
   - guoyww/animatediff-motion-adapter-v1-5-3
benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py CHANGED
@@ -32,10 +32,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -119,7 +119,7 @@ def load_text_prompts(
     Returns:
         Total number of prompts and a list of batches of prompts.
     """
-    dataset = json.load(open(path))["caption"]
+    dataset = json.load(open(path))["caption"] * 10
     if num_batches is not None:
         if len(dataset) < num_batches * batch_size:
             raise ValueError("Dataset is too small for the given number of batches.")
@@ -151,8 +151,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -190,7 +190,7 @@ def benchmark(args: argparse.Namespace) -> None:
     ]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -208,15 +208,15 @@ def benchmark(args: argparse.Namespace) -> None:
 
         params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -253,10 +253,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
        power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py CHANGED
@@ -28,44 +28,46 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
-                    "docker", "run",
-                    "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
-                    "--cap-add", "SYS_ADMIN",
-                    "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
-                    "--rm",
-                    "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
-                    "-v", f"{os.getcwd()}:/workspace/text-to-video",
-                    "mlenergy/leaderboard:diffusion-t2v",
-                    "--result-root", args.result_root,
-                    "--batch-size", batch_size,
-                    "--num-batches", "10",
-                    "--power-limit", power_limit,
-                    "--model", args.model,
-                    "--dataset-path", args.dataset_path,
-                    "--huggingface-token", hf_token,
-                    "--num-inference-steps", args.num_inference_steps,
-                    "--num-frames", args.num_frames,
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/text-to-video",
+                        "mlenergy/leaderboard:diffusion-t2v",
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "10",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--dataset-path", args.dataset_path,
+                        "--huggingface-token", hf_token,
+                        "--num-inference-steps", num_inference_steps,
+                        "--num-frames", args.num_frames,
+                    ],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -76,7 +78,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
-    parser.add_argument("--num-inference-steps", type=str, required=True, help="Number of denoising steps")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of denoising steps")
     parser.add_argument("--num-frames", type=str, required=True, help="Number of frames to generate")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 16348.217100000009,
-    "Batch latency (s)": 44.41898396015167,
+    "Energy/video (J)": 16915.850124999997,
+    "Batch latency (s)": 46.14208295941353,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 16091.048200000008,
-    "Batch latency (s)": 85.8618726491928,
+    "Energy/video (J)": 16496.045437499997,
+    "Batch latency (s)": 89.03019031882286,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 15346.527300000005,
-    "Batch latency (s)": 42.11920440196991,
+    "Energy/video (J)": 15709.767625000095,
+    "Batch latency (s)": 42.397395104169846,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 15291.016625000047,
+    "Batch latency (s)": 82.90474811196327,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 14761.389999999976,
+    "Batch latency (s)": 120.65004900523594,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 8803.383999999985,
-    "Batch latency (s)": 24.10387804508209,
+    "Energy/video (J)": 9066.434124999912,
+    "Batch latency (s)": 24.369865357875824,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 14
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8835.22312499996,
+    "Batch latency (s)": 47.65615049004555,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8683.536285714292,
+    "Batch latency (s)": 70.55723374230521,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 14222.658400000026,
-    "Batch latency (s)": 22.950254821777342,
+    "Energy/video (J)": 14867.419125000015,
+    "Batch latency (s)": 23.717748790979385,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 13657.628800000017,
-    "Batch latency (s)": 42.94859471321106,
+    "Energy/video (J)": 14348.508499999996,
+    "Batch latency (s)": 44.71498331427574,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 13366.447699999995,
-    "Batch latency (s)": 20.89660472869873,
+    "Energy/video (J)": 13392.813624999952,
+    "Batch latency (s)": 20.788252592086792,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 12901.83275000006,
+    "Batch latency (s)": 39.99498334527016,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 12790.552809523862,
+ "Batch latency (s)": 59.380911929266794,
+ "Batch size": 3,
+ "Denoising steps": 25,
+ "Frames": 25
+ }
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "stabilityai/stable-video-diffusion-img2vid",
  "GPU": "NVIDIA H100 80GB HBM3",
- "Energy/video (J)": 7550.921200000029,
- "Batch latency (s)": 12.265265846252442,
+ "Energy/video (J)": 7623.074500000104,
+ "Batch latency (s)": 12.191031396389008,
  "Batch size": 1,
  "Denoising steps": 25,
  "Frames": 14
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 7416.721437499975,
+ "Batch latency (s)": 23.368041068315506,
+ "Batch size": 2,
+ "Denoising steps": 25,
+ "Frames": 14
+ }
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 7354.00133333333,
+ "Batch latency (s)": 34.5100462777274,
+ "Batch size": 3,
+ "Denoising steps": 25,
+ "Frames": 14
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 329.6848999999929,
- "Batch latency (s)": 1.808762288093567,
+ "Energy/image (J)": 324.06850000005215,
+ "Batch latency (s)": 1.6537675857543945,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 174.24531874999812,
- "Batch latency (s)": 7.439638161659241,
+ "Energy/image (J)": 172.51030000000029,
+ "Batch latency (s)": 7.375234842300415,
  "Batch size": 16,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 232.40825000000186,
- "Batch latency (s)": 1.640995717048645,
+ "Energy/image (J)": 230.3378000000026,
+ "Batch latency (s)": 1.5861663103103638,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "kandinsky-community/kandinsky-2-2-decoder",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 163.0797656249997,
+ "Batch latency (s)": 13.998618459701538,
+ "Batch size": 32,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 202.8745750000002,
- "Batch latency (s)": 2.3463359832763673,
+ "Energy/image (J)": 200.16462499999906,
+ "Batch latency (s)": 2.299217462539673,
  "Batch size": 4,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 187.65767500000075,
- "Batch latency (s)": 4.030062103271485,
+ "Energy/image (J)": 184.9021625000052,
+ "Batch latency (s)": 4.0124232292175295,
  "Batch size": 8,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-3",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 914.0325000000187,
- "Batch latency (s)": 3.1329710721969604,
+ "Energy/image (J)": 930.2532999999821,
+ "Batch latency (s)": 3.0359585523605346,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-3",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 875.4787999999942,
- "Batch latency (s)": 5.2747025966644285,
+ "Energy/image (J)": 895.7575500000036,
+ "Batch latency (s)": 5.261959171295166,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 235.712099999981,
- "Batch latency (s)": 1.0208970069885255,
+ "Energy/image (J)": 227.21699999999254,
+ "Batch latency (s)": 0.9210062503814698,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 157.4185124999989,
- "Batch latency (s)": 6.579187059402466,
+ "Energy/image (J)": 156.51368749999673,
+ "Batch latency (s)": 6.559858226776123,
  "Batch size": 16,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 196.30995000000112,
- "Batch latency (s)": 1.1641260623931884,
+ "Energy/image (J)": 188.78500000000932,
+ "Batch latency (s)": 1.1187455892562865,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "prompthero/openjourney-v4",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 154.23499999999768,
+ "Batch latency (s)": 12.850126147270203,
+ "Batch size": 32,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 177.43804999999702,
- "Batch latency (s)": 1.884285831451416,
+ "Energy/image (J)": 175.33082500000017,
+ "Batch latency (s)": 1.8664743423461914,
  "Batch size": 4,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "prompthero/openjourney-v4",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 150.57691875000017,
+ "Batch latency (s)": 25.000647592544556,
+ "Batch size": 64,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 162.92667500000098,
- "Batch latency (s)": 3.505508875846863,
+ "Energy/image (J)": 163.7534500000067,
+ "Batch latency (s)": 3.423132634162903,
  "Batch size": 8,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 717.2012000000104,
- "Batch latency (s)": 1.9508831262588502,
+ "Energy/image (J)": 745.7899999999441,
+ "Batch latency (s)": 1.9644724607467652,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 681.1273499999894,
- "Batch latency (s)": 3.633535361289978,
+ "Energy/image (J)": 700.4580500000156,
+ "Batch latency (s)": 3.6897377252578734,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 672.6853499999968,
- "Batch latency (s)": 7.193562436103821,
+ "Energy/image (J)": 688.6121250000084,
+ "Batch latency (s)": 7.168970584869385,
  "Batch size": 4,
  "Denoising steps": 25
  }
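The per-configuration file names above (bs{N}+steps{S}.json, nested under GPU and model directories) encode the batch-size sweep for each model. As an illustration only, and not the repository's own aggregation script, records like these could be collected and compared across batch sizes roughly as follows; the directory path and glob pattern below are assumptions based on the files shown in this diff.

```python
import glob
import json
import os

# Hypothetical model directory, following the layout of the files in this commit.
model_dir = "data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4"

# Collect every per-batch-size record for this model (25 denoising steps).
records = []
for path in glob.glob(os.path.join(model_dir, "bs*+steps25.json")):
    with open(path) as f:
        records.append(json.load(f))

# Print energy per image and batch latency, ordered by batch size.
for r in sorted(records, key=lambda r: r["Batch size"]):
    print(f'bs={r["Batch size"]:>3}  '
          f'{r["Energy/image (J)"]:.1f} J/image  '
          f'{r["Batch latency (s)"]:.2f} s/batch')
```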