#!/usr/bin/env python3
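"""Benchmark a lama-cleaner model: latency, RAM, and GPU memory usage.

Example invocation (assuming this file is saved as benchmark.py; model
names depend on the installed lama-cleaner version, "lama" is used here
as an assumed default):

    python benchmark.py --name lama --device cuda --times 10 --empty-cache
"""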
import argparse
import os
import time
import numpy as np
import nvidia_smi
import psutil
import torch
from lama_cleaner.model_manager import ModelManager
from lama_cleaner.schema import Config, HDStrategy, SDSampler
try:
    # Disable the TorchScript JIT fusers. These are private APIs that may
    # not exist in every torch version, hence the broad except.
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except Exception:
    pass
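# Pin the BLAS/OpenMP thread pools to a fixed size so CPU-side timings
# stay comparable between runs.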
NUM_THREADS = str(4)
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]

def run_model(model, size):
    # Synthetic inputs: random RGB image and single-channel mask.
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)
    config = Config(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim,
    )
    model(image, mask, config)

def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def format(metrics):
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())
    # Report latency, RAM, and GPU memory usage for each input size.
    for size in sizes:
        torch.cuda.empty_cache()
        time_metrics = []
        cpu_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            if empty_cache:
                # Optionally drop the CUDA caching allocator between runs
                # so each iteration starts from a cold cache.
                torch.cuda.empty_cache()
            start = time.time()
            run_model(model, size)
            torch.cuda.synchronize()

            # cpu_metrics.append(process.cpu_percent())
            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            # NVML reports device-wide usage, so this includes memory held
            # by other processes on the same GPU.
            gpu_memory_metrics.append(
                nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
            )

        print(f"size: {size}".center(80, "-"))
        # print(f"cpu: {format(cpu_metrics)}")
        print(f"latency: {format(time_metrics)}ms")
        print(f"memory: {format(memory_metrics)} MB")
        print(f"gpu memory: {format(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()

def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int)
    parser.add_argument("--empty-cache", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    args = get_args_parser()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        sd_run_local=True,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
        hf_access_token="123",
    )
    benchmark(model, args.times, args.empty_cache)