Jae-Won Chung committed on
Commit
c97bae1
1 Parent(s): abd945c

Updated diffusion benchmark and data

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitignore +1 -1
  2. benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml +4 -4
  3. benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml +4 -4
  4. benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py +1 -1
  5. benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py +26 -15
  6. benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py +41 -35
  7. benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py +1 -1
  8. benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml +1 -1
  9. benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py +2 -1
  10. benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py +49 -8
  11. benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py +20 -18
  12. benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml +1 -1
  13. benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml +1 -1
  14. benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py +2 -1
  15. benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py +11 -11
  16. benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py +37 -35
  17. data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
  18. data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
  19. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
  20. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
  21. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
  22. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
  23. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
  24. data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
  25. data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
  26. data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
  27. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
  28. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
  29. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
  30. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
  31. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
  32. data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
  33. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json +2 -2
  34. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json +2 -2
  35. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json +2 -2
  36. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json +8 -0
  37. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json +2 -2
  38. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json +2 -2
  39. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json +2 -2
  40. data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json +2 -2
  41. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json +2 -2
  42. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json +2 -2
  43. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json +2 -2
  44. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json +8 -0
  45. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json +2 -2
  46. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json +8 -0
  47. data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json +2 -2
  48. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json +2 -2
  49. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json +2 -2
  50. data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json +2 -2
.gitignore CHANGED
@@ -18,4 +18,4 @@ build/
 
 # Data files
 *.log
-pegasus/consumed.yaml
+figures/
benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 8 4 2 1 --power-limits 400 --num-inference-steps 25"
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720'
+  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576'
+  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576'
benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_700.json --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 25"
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+  - "--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720"
+  - "--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576"
+  - "--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576"
benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py CHANGED
@@ -27,10 +27,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_infernece_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -80,6 +80,7 @@ def load_text_image_prompts(
     path: str,
     batch_size: int,
     num_batches: int | None = None,
+    image_resize: tuple[int, int] | None = None,
 ) -> tuple[int, list[tuple[list[str], list[Image.Image]]]]:
     """Load the dataset to feed the model and return it as a list of batches of prompts.
 
@@ -93,6 +94,9 @@ def load_text_image_prompts(
     dataset = json.load(open(path))
     assert len(dataset["caption"]) == len(dataset["video_id"])
 
+    dataset["caption"] *= 10
+    dataset["video_id"] *= 10
+
     if num_batches is not None:
         if len(dataset["caption"]) < num_batches * batch_size:
             raise ValueError("Not enough data for the requested number of batches.")
@@ -103,6 +107,8 @@ def load_text_image_prompts(
     dataset["first_frame"] = [
         load_image(str(image_path / f"{video_id}.jpg")) for video_id in dataset["video_id"]
     ]
+    if image_resize is not None:
+        dataset["first_frame"] = [image.resize(image_resize) for image in dataset["first_frame"]]
 
     batched = [
         (dataset["caption"][i : i + batch_size], dataset["first_frame"][i : i + batch_size])
@@ -135,8 +141,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -150,11 +156,16 @@ def benchmark(args: argparse.Namespace) -> None:
     pynvml.nvmlInit()
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
     gpu_model = pynvml.nvmlDeviceGetName(handle)
-    pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
-    pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
+    # pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
+    # pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
     pynvml.nvmlShutdown()
 
-    num_prompts, batched_prompts = load_text_image_prompts(args.dataset_path, args.batch_size, args.num_batches)
+    num_prompts, batched_prompts = load_text_image_prompts(
+        args.dataset_path,
+        args.batch_size,
+        args.num_batches,
+        (args.width, args.height),
+    )
 
     pipeline = get_pipeline(args.model)
 
@@ -189,7 +200,7 @@ def benchmark(args: argparse.Namespace) -> None:
     fps_param_name = fps_param_name_candidates[0]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -210,15 +221,15 @@ def benchmark(args: argparse.Namespace) -> None:
         if args.add_text_prompt:
             params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -255,10 +266,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_infernece_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
@@ -289,8 +300,8 @@ if __name__ == "__main__":
     parser.add_argument("--num-inference-steps", type=int, default=50, help="The number of denoising steps.")
     parser.add_argument("--num-frames", type=int, default=1, help="The number of frames to generate.")
    parser.add_argument("--fps", type=int, default=16, help="Frames per second for micro-conditioning.")
-    parser.add_argument("--height", type=int, help="Height of the generated video.")
-    parser.add_argument("--width", type=int, help="Width of the generated video.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--num-batches", type=int, default=None, help="The number of batches to use from the dataset.")
     parser.add_argument("--save-every", type=int, default=10, help="Save generations to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py CHANGED
@@ -28,44 +28,48 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
-                    "docker", "run",
-                    "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
-                    "--cap-add", "SYS_ADMIN",
-                    "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
-                    "--rm",
-                    "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
-                    "-v", f"{os.getcwd()}:/workspace/image-to-video",
-                    "mlenergy/leaderboard:diffusion-i2v",
-                    "--dataset-path", args.dataset_path,
-                    "--result-root", args.result_root,
-                    "--batch-size", batch_size,
-                    "--num-batches", "10",
-                    "--power-limit", power_limit,
-                    "--model", args.model,
-                    "--huggingface-token", hf_token,
-                    "--num-frames", args.num_frames,
-                    "--num-inference-steps", args.num_inference_steps,
-                ] + (["--add-text-prompt"] if args.add_text_prompt else []),
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/image-to-video",
+                        "mlenergy/leaderboard:diffusion-i2v",
+                        "--dataset-path", args.dataset_path,
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "8",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--huggingface-token", hf_token,
+                        "--num-frames", args.num_frames,
+                        "--num-inference-steps", num_inference_steps,
+                        "--width", str(args.width),
+                        "--height", str(args.height),
+                    ] + (["--add-text-prompt"] if args.add_text_prompt else []),
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -77,8 +81,10 @@ if __name__ == "__main__":
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
     parser.add_argument("--num-frames", type=str, help="Number of frames to generate")
-    parser.add_argument("--num-inference-steps", type=str, help="Number of denoising steps")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "30", "40", "50"], help="Number of inference steps to run")
     parser.add_argument("--add-text-prompt", action="store_true", help="Input text prompt alongside image.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py CHANGED
@@ -3,7 +3,7 @@ import json
 
 import cv2
 
-DATASET_PATH = "sharegpt4video_700.json"
+DATASET_PATH = "sharegpt4video_100.json"
 
 
 def main() -> None:
benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --num-inference-steps 1 2 4 8 16 25 30 40 50 --power-limits 400"
   model:
   - stabilityai/stable-diffusion-2-1
   - stabilityai/stable-diffusion-xl-base-1.0
benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py CHANGED
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import time
 import json
 import argparse
+import multiprocessing as mp
 from pprint import pprint
 from pathlib import Path
 from contextlib import suppress
@@ -11,6 +13,7 @@ from dataclasses import dataclass, field, asdict
 import torch
 import pynvml
 import numpy as np
+import pandas as pd
 from PIL import Image
 from datasets import load_dataset, Dataset
 from transformers.trainer_utils import set_seed
@@ -35,9 +38,9 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
     num_prompts: int
     average_clip_score: float = 0.0
     total_runtime: float = 0.0
@@ -118,6 +121,28 @@ def load_partiprompts(
     return len(batched) * batch_size, batched
 
 
+def power_monitor(csv_path: str, gpu_indices: list[int], chan: mp.SimpleQueue) -> None:
+    pynvml.nvmlInit()
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices]
+
+    fields = [
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU),
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY),
+    ]
+
+    columns = ["timestamp"] + sum([[f"gpu{i}", f"vram{i}"] for i in gpu_indices], [])
+    power: list[list] = []
+    while chan.empty():
+        row = [time.monotonic()]
+        values = [pynvml.nvmlDeviceGetFieldValues(h, fields) for h in handles]
+        for value in values:
+            row.extend((value[0].value.uiVal, value[1].value.uiVal))
+        power.append(row)
+        time.sleep(max(0.0, 0.1 - (time.monotonic() - row[0])))
+
+    pd.DataFrame(power, columns=columns).to_csv(csv_path, index=False)
+
+
 def calculate_clip_score(
     model: CLIPModel,
     processor: CLIPProcessor,
@@ -183,8 +208,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     image_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -222,27 +247,42 @@ def benchmark(args: argparse.Namespace) -> None:
         ResultIntermediateBatched(prompts=batch) for batch in batched_prompts
     ]
 
+    pmon = None
+    pmon_chan = None
+    if args.monitor_power:
+        pmon_chan = mp.SimpleQueue()
+        pmon = mp.get_context("spawn").Process(
+            target=power_monitor,
+            args=(f"{benchmark_name}+power.csv", [g.gpu_index for g in zeus_monitor.gpus.gpus], pmon_chan),
+        )
+        pmon.start()
+
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     for ind, intermediate in enumerate(intermediates):
         print(f"Batch {ind + 1}/{len(intermediates)}")
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         images = pipeline(
             intermediate.prompts,
             generator=rng,
             num_inference_steps=args.num_inference_steps,
            output_type="np",
         ).images
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.images = images
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
+    if pmon is not None and pmon_chan is not None:
+        pmon_chan.put("stop")
+        pmon.join(timeout=5.0)
+        pmon.terminate()
+
     # Scale images to [0, 256] and convert to uint8
     for intermediate in intermediates:
         intermediate.images = (intermediate.images * 255).astype("uint8")
@@ -292,9 +332,9 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
         num_prompts=num_prompts,
         average_clip_score=sum(r.clip_score for r in results) / len(results),
         total_runtime=measurements.time,
@@ -326,6 +366,7 @@ if __name__ == "__main__":
     parser.add_argument("--image-save-every", type=int, default=10, help="Save images to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
     parser.add_argument("--huggingface-token", type=str, help="The HuggingFace token to use.")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
 
     benchmark(args)
benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py CHANGED
@@ -28,12 +28,13 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                cmd=[
                     "docker", "run",
                     "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
                     "--cap-add", "SYS_ADMIN",
@@ -48,22 +49,21 @@ def main(args: argparse.Namespace) -> None:
                     "--power-limit", power_limit,
                     "--model", args.model,
                     "--huggingface-token", hf_token,
-                    "--num-inference-steps", "25",
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+                    "--num-inference-steps", num_inference_steps,
+                ]
+                if args.monitor_power:
+                    cmd.append("--monitor-power")
+                with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -74,5 +74,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of inference steps to run")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
   - ali-vilab/text-to-video-ms-1.7b
   - guoyww/animatediff-motion-adapter-v1-5-3
benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_700.json --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
   - ali-vilab/text-to-video-ms-1.7b
   - guoyww/animatediff-motion-adapter-v1-5-3
benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f" {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py CHANGED
@@ -32,10 +32,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -119,7 +119,7 @@ def load_text_prompts(
     Returns:
         Total number of prompts and a list of batches of prompts.
     """
-    dataset = json.load(open(path))["caption"]
+    dataset = json.load(open(path))["caption"] * 10
     if num_batches is not None:
         if len(dataset) < num_batches * batch_size:
             raise ValueError("Dataset is too small for the given number of batches.")
@@ -151,8 +151,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -190,7 +190,7 @@ def benchmark(args: argparse.Namespace) -> None:
     ]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", sync_cuda=False)
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -208,15 +208,15 @@ def benchmark(args: argparse.Namespace) -> None:
 
         params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", sync_cuda=False)
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", sync_cuda=False)
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", sync_cuda=False)
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -253,10 +253,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
        power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py CHANGED
@@ -28,44 +28,46 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            print_and_write(outfile, f"{batch_size=}, {power_limit=}\n", flush=True)
-            with subprocess.Popen(
-                args=[
-                    "docker", "run",
-                    "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
-                    "--cap-add", "SYS_ADMIN",
-                    "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
-                    "--rm",
-                    "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
-                    "-v", f"{os.getcwd()}:/workspace/text-to-video",
-                    "mlenergy/leaderboard:diffusion-t2v",
-                    "--result-root", args.result_root,
-                    "--batch-size", batch_size,
-                    "--num-batches", "10",
-                    "--power-limit", power_limit,
-                    "--model", args.model,
-                    "--dataset-path", args.dataset_path,
-                    "--huggingface-token", hf_token,
-                    "--num-inference-steps", args.num_inference_steps,
-                    "--num-frames", args.num_frames,
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-            ) as proc:
-                if proc.stdout:
-                    i = 0
-                    for line in proc.stdout:
-                        print_and_write(outfile, line, flush=i % 50 == 0)
-                        i += 1
-
-            # If proc exited with non-zero status, it's probably an OOM.
-            # Move on to the next batch size.
-            if proc.returncode != 0:
-                break
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/text-to-video",
+                        "mlenergy/leaderboard:diffusion-t2v",
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "10",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--dataset-path", args.dataset_path,
+                        "--huggingface-token", hf_token,
+                        "--num-inference-steps", num_inference_steps,
+                        "--num-frames", args.num_frames,
+                    ],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -76,7 +78,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
-    parser.add_argument("--num-inference-steps", type=str, required=True, help="Number of denoising steps")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of denoising steps")
     parser.add_argument("--num-frames", type=str, required=True, help="Number of frames to generate")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 16348.217100000009,
-    "Batch latency (s)": 44.41898396015167,
+    "Energy/video (J)": 16915.850124999997,
+    "Batch latency (s)": 46.14208295941353,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 16091.048200000008,
-    "Batch latency (s)": 85.8618726491928,
+    "Energy/video (J)": 16496.045437499997,
+    "Batch latency (s)": 89.03019031882286,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 15346.527300000005,
-    "Batch latency (s)": 42.11920440196991,
+    "Energy/video (J)": 15709.767625000095,
+    "Batch latency (s)": 42.397395104169846,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 15291.016625000047,
+    "Batch latency (s)": 82.90474811196327,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 14761.389999999976,
+    "Batch latency (s)": 120.65004900523594,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": 8803.383999999985,
-    "Batch latency (s)": 24.10387804508209,
+    "Energy/video (J)": 9066.434124999912,
+    "Batch latency (s)": 24.369865357875824,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 14
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8835.22312499996,
+    "Batch latency (s)": 47.65615049004555,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8683.536285714292,
+    "Batch latency (s)": 70.55723374230521,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 14222.658400000026,
-    "Batch latency (s)": 22.950254821777342,
+    "Energy/video (J)": 14867.419125000015,
+    "Batch latency (s)": 23.717748790979385,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 13657.628800000017,
-    "Batch latency (s)": 42.94859471321106,
+    "Energy/video (J)": 14348.508499999996,
+    "Batch latency (s)": 44.71498331427574,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": 13366.447699999995,
-    "Batch latency (s)": 20.89660472869873,
+    "Energy/video (J)": 13392.813624999952,
+    "Batch latency (s)": 20.788252592086792,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 12901.83275000006,
+    "Batch latency (s)": 39.99498334527016,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 12790.552809523862,
+ "Batch latency (s)": 59.380911929266794,
+ "Batch size": 3,
+ "Denoising steps": 25,
+ "Frames": 25
+ }
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "stabilityai/stable-video-diffusion-img2vid",
  "GPU": "NVIDIA H100 80GB HBM3",
- "Energy/video (J)": 7550.921200000029,
- "Batch latency (s)": 12.265265846252442,
+ "Energy/video (J)": 7623.074500000104,
+ "Batch latency (s)": 12.191031396389008,
  "Batch size": 1,
  "Denoising steps": 25,
  "Frames": 14
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 7416.721437499975,
+ "Batch latency (s)": 23.368041068315506,
+ "Batch size": 2,
+ "Denoising steps": 25,
+ "Frames": 14
+ }
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "Model": "stabilityai/stable-video-diffusion-img2vid",
+ "GPU": "NVIDIA H100 80GB HBM3",
+ "Energy/video (J)": 7354.00133333333,
+ "Batch latency (s)": 34.5100462777274,
+ "Batch size": 3,
+ "Denoising steps": 25,
+ "Frames": 14
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 329.6848999999929,
- "Batch latency (s)": 1.808762288093567,
+ "Energy/image (J)": 324.06850000005215,
+ "Batch latency (s)": 1.6537675857543945,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 174.24531874999812,
- "Batch latency (s)": 7.439638161659241,
+ "Energy/image (J)": 172.51030000000029,
+ "Batch latency (s)": 7.375234842300415,
  "Batch size": 16,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 232.40825000000186,
- "Batch latency (s)": 1.640995717048645,
+ "Energy/image (J)": 230.3378000000026,
+ "Batch latency (s)": 1.5861663103103638,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "kandinsky-community/kandinsky-2-2-decoder",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 163.0797656249997,
+ "Batch latency (s)": 13.998618459701538,
+ "Batch size": 32,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 202.8745750000002,
- "Batch latency (s)": 2.3463359832763673,
+ "Energy/image (J)": 200.16462499999906,
+ "Batch latency (s)": 2.299217462539673,
  "Batch size": 4,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-2-2-decoder",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 187.65767500000075,
- "Batch latency (s)": 4.030062103271485,
+ "Energy/image (J)": 184.9021625000052,
+ "Batch latency (s)": 4.0124232292175295,
  "Batch size": 8,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-3",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 914.0325000000187,
- "Batch latency (s)": 3.1329710721969604,
+ "Energy/image (J)": 930.2532999999821,
+ "Batch latency (s)": 3.0359585523605346,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "kandinsky-community/kandinsky-3",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 875.4787999999942,
- "Batch latency (s)": 5.2747025966644285,
+ "Energy/image (J)": 895.7575500000036,
+ "Batch latency (s)": 5.261959171295166,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 235.712099999981,
- "Batch latency (s)": 1.0208970069885255,
+ "Energy/image (J)": 227.21699999999254,
+ "Batch latency (s)": 0.9210062503814698,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 157.4185124999989,
- "Batch latency (s)": 6.579187059402466,
+ "Energy/image (J)": 156.51368749999673,
+ "Batch latency (s)": 6.559858226776123,
  "Batch size": 16,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 196.30995000000112,
- "Batch latency (s)": 1.1641260623931884,
+ "Energy/image (J)": 188.78500000000932,
+ "Batch latency (s)": 1.1187455892562865,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "prompthero/openjourney-v4",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 154.23499999999768,
+ "Batch latency (s)": 12.850126147270203,
+ "Batch size": 32,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 177.43804999999702,
- "Batch latency (s)": 1.884285831451416,
+ "Energy/image (J)": 175.33082500000017,
+ "Batch latency (s)": 1.8664743423461914,
  "Batch size": 4,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Model": "prompthero/openjourney-v4",
+ "GPU": "NVIDIA A100-SXM4-40GB",
+ "Energy/image (J)": 150.57691875000017,
+ "Batch latency (s)": 25.000647592544556,
+ "Batch size": 64,
+ "Denoising steps": 25
+ }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "prompthero/openjourney-v4",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 162.92667500000098,
- "Batch latency (s)": 3.505508875846863,
+ "Energy/image (J)": 163.7534500000067,
+ "Batch latency (s)": 3.423132634162903,
  "Batch size": 8,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 717.2012000000104,
- "Batch latency (s)": 1.9508831262588502,
+ "Energy/image (J)": 745.7899999999441,
+ "Batch latency (s)": 1.9644724607467652,
  "Batch size": 1,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 681.1273499999894,
- "Batch latency (s)": 3.633535361289978,
+ "Energy/image (J)": 700.4580500000156,
+ "Batch latency (s)": 3.6897377252578734,
  "Batch size": 2,
  "Denoising steps": 25
  }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json CHANGED
@@ -1,8 +1,8 @@
  {
  "Model": "segmind/SSD-1B",
  "GPU": "NVIDIA A100-SXM4-40GB",
- "Energy/image (J)": 672.6853499999968,
- "Batch latency (s)": 7.193562436103821,
+ "Energy/image (J)": 688.6121250000084,
+ "Batch latency (s)": 7.168970584869385,
  "Batch size": 4,
  "Denoising steps": 25
  }
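The per-configuration file names above (bs{N}+steps{S}.json, nested under GPU and model directories) encode the batch-size sweep for each model. As an illustration only, and not the repository's own aggregation script, records like these could be collected and compared across batch sizes roughly as follows; the directory path and glob pattern below are assumptions based on the files shown in this diff.

```python
import glob
import json
import os

# Hypothetical model directory, following the layout of the files in this commit.
model_dir = "data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4"

# Collect every per-batch-size record for this model (25 denoising steps).
records = []
for path in glob.glob(os.path.join(model_dir, "bs*+steps25.json")):
    with open(path) as f:
        records.append(json.load(f))

# Print energy per image and batch latency, ordered by batch size.
for r in sorted(records, key=lambda r: r["Batch size"]):
    print(f'bs={r["Batch size"]:>3}  '
          f'{r["Energy/image (J)"]:.1f} J/image  '
          f'{r["Batch latency (s)"]:.2f} s/batch')
```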