import gc
import time

import numpy as np
import torch
import tyro
from diffusers import AutoPipelineForText2Image, DiffusionPipeline
from PIL import Image
from transformers.trainer_utils import set_seed
from zeus.monitor import ZeusMonitor

from metrics import load_prompts, calculate_clip_score
from utils import get_logger, CsvHandler
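
# `load_prompts` and `calculate_clip_score` come from this repo's local
# `metrics` module (not shown in this file). For reference, a typical CLIP
# score helper built on torchmetrics might look like the sketch below; the
# actual implementation in metrics.py may differ.
#
#   from functools import partial
#   from torchmetrics.functional.multimodal import clip_score
#
#   _clip_score_fn = partial(
#       clip_score, model_name_or_path="openai/clip-vit-base-patch16"
#   )
#
#   def calculate_clip_score(images, prompts):
#       # images: float array in [0, 1] with shape (N, H, W, C)
#       images_int = (images * 255).astype("uint8")
#       score = _clip_score_fn(
#           torch.from_numpy(images_int).permute(0, 3, 1, 2), prompts
#       ).detach()
#       return round(float(score), 4)
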
# default parameters
DEVICE = "cuda:0"
WEIGHT_DTYPE = torch.float16
SEED = 0
OUTPUT_FILE = "results.csv"
OUTPUT_IMAGES = "images/"

def get_pipeline(model, device=DEVICE, weight_dtype=WEIGHT_DTYPE):
    try:
        return AutoPipelineForText2Image.from_pretrained(
            model, torch_dtype=weight_dtype, safety_checker=None
        ).to(device)
    except ValueError:
        # AutoPipelineForText2Image raises ValueError for models it cannot map
        # to a text-to-image pipeline class; fall back to the generic loader.
        return DiffusionPipeline.from_pretrained(
            model, torch_dtype=weight_dtype, safety_checker=None
        ).to(device)
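
# Example (illustrative model id):
#   pipeline = get_pipeline("stabilityai/stable-diffusion-2-1")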

def gpu_warmup(pipeline):
    """Warm up the GPU by running the given pipeline for 10 seconds.

    Warmup lets GPU clocks and allocator state stabilize so that the first
    measured batch is not penalized with one-time startup costs.
    """
    logger = get_logger()
    logger.info("Warming up GPU")
    generator = torch.manual_seed(2)
    timeout_start = time.time()
    prompts, _ = load_prompts(1, 1)
    while time.time() < timeout_start + 10:
        _ = pipeline(
            prompts, num_images_per_prompt=10, generator=generator, output_type="np"
        ).images
    logger.info("Finished warming up GPU")

def benchmark(
    model: str,
    benchmark_size: int = 0,
    batch_size: int = 1,
    result_file: str = OUTPUT_FILE,
    images_path: str = OUTPUT_IMAGES,
    device: str = DEVICE,
    seed: int = SEED,
    weight_dtype: torch.dtype = WEIGHT_DTYPE,
    write_header: bool = False,
    warmup: bool = False,
    settings: dict = {},
) -> None:
"""Benchmarks given model with a set of parameters.
Args:
model: The name of the model to benchmark, as shown on HuggingFace.
benchmark_size: The number of prompts to benchmark on. If 0, benchmarks
the entire parti-prompts dataset.
batch_size: The size of each batch of prompts. When benchmarking, the
prompts are split into batches of this size, and prompts are fed into
the model in batches.
result_file: The path to the output csv file.
images_path: The path to the output images directory.
device: The device to run the benchmark on.
seed: The seed to use for the RNG.
weight_dtype: The weight dtype to use for the model.
write_header: Whether to write the header row to the output csv file,
recommended to be True for the first run.
warmup: Whether to warm up the GPU before running the benchmark,
recommended to be True for the first run of a model.
settings: Any additional settings to pass to the pipeline, supports
any keyword parameters accepted by the model chosen. See HuggingFace
documentation on particular models for more details.
"""
    logger = get_logger()
    logger.info(f"Running benchmark for model: {model}")
    csv_handler = CsvHandler(result_file)
    if write_header:
        csv_handler.write_header(
            [
                "model",
                "GPU",
                "num_prompts",
                "batch_size",
                "clip_score",
                "average_batch_latency(s)",
                "throughput(image/s)",
                "avg_energy(J)",
                "peak_memory(GB)",
            ]
        )
    set_seed(seed)
    prompts, batched_prompts = load_prompts(benchmark_size, batch_size)
    logger.info("Loaded prompts")
    generator = torch.manual_seed(seed)
    torch.cuda.set_device(device)
    monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])
    pipeline = get_pipeline(model, device=device, weight_dtype=weight_dtype)
    if warmup:
        gpu_warmup(pipeline)
    torch.cuda.empty_cache()
    gc.collect()
    torch.cuda.reset_peak_memory_stats(device=device)
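    # ZeusMonitor measures wall-clock time and GPU energy between
    # begin_window() and end_window() calls sharing a window name;
    # end_window() returns a measurement exposing .time (seconds) and
    # .total_energy (joules), which are consumed below.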
    monitor.begin_window("generate")
    images = []
    for batch in batched_prompts:
        image = pipeline(
            batch, generator=generator, output_type="np", **settings
        ).images
        images.append(image)
    images = np.concatenate(images)
    result_monitor = monitor.end_window("generate")
    peak_memory = torch.cuda.max_memory_allocated(device=device)
    # Save every 10th image, named after its prompt, as a visual spot check.
    for saved_image, saved_prompt in zip(images[::10], prompts[::10]):
        saved_image = (saved_image * 255).astype(np.uint8)
        Image.fromarray(saved_image).save(images_path + saved_prompt + ".png")
    clip_score = calculate_clip_score(images, prompts)
    result = {
        "model": model,
        "GPU": torch.cuda.get_device_name(device),
        "num_prompts": len(prompts),
        "batch_size": batch_size,
        "clip_score": clip_score,
        # Derive counts from the loaded prompts rather than benchmark_size,
        # which is 0 when the entire dataset is benchmarked.
        "avg_batch_latency": result_monitor.time / len(batched_prompts),
        "throughput": len(prompts) / result_monitor.time,
        "avg_energy": result_monitor.total_energy / len(prompts),
        # max_memory_allocated() reports bytes; convert to GB to match the header.
        "peak_memory": peak_memory / 1e9,
    }
logger.info("Results for model " + model + ":")
logger.info(result)
csv_handler.write_results(result)
logger.info("Finished benchmarking for " + model)

if __name__ == "__main__":
    tyro.cli(benchmark)