Spaces:
Running
on
L40S
Running
on
L40S
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a | |
# copy of this software and associated documentation files (the "Software"), | |
# to deal in the Software without restriction, including without limitation | |
# the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
# and/or sell copies of the Software, and to permit persons to whom the | |
# Software is furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in | |
# all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
# DEALINGS IN THE SOFTWARE. | |
# | |
# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES | |
# SPDX-License-Identifier: MIT | |
import logging | |
import time | |
from abc import ABC, abstractmethod | |
from typing import Optional | |
import numpy as np | |
import torch | |
from se3_transformer.runtime.loggers import Logger | |
from se3_transformer.runtime.metrics import MeanAbsoluteError | |
class BaseCallback(ABC):
    """ Interface of lifecycle hooks. Every hook is a no-op here; subclasses override the ones they need. """

    def on_fit_start(self, optimizer, args):
        """ No-op hook for the start of fitting; receives the optimizer and the run arguments. """

    def on_fit_end(self):
        """ No-op hook for the end of fitting. """

    def on_epoch_end(self):
        """ No-op hook for the end of an epoch. """

    def on_batch_start(self):
        """ No-op hook for the start of a batch. """

    def on_validation_step(self, input, target, pred):
        """ No-op hook for one validation step; receives the input, the target and the prediction. """

    def on_validation_end(self, epoch=None):
        """ No-op hook for the end of validation; ``epoch`` is optional. """

    def on_checkpoint_load(self, checkpoint):
        """ No-op hook receiving a loaded checkpoint. """

    def on_checkpoint_save(self, checkpoint):
        """ No-op hook receiving a checkpoint that is about to be saved. """
class LRSchedulerCallback(BaseCallback):
    """ Owns a learning-rate scheduler: builds it at fit start, steps it once per epoch, and checkpoints its state.

    Subclasses provide the concrete scheduler through :meth:`get_scheduler`.
    """

    def __init__(self, logger: Optional[Logger] = None):
        """
        :param logger: Optional metrics logger; when None, the learning rate is not logged
        """
        self.logger = logger
        self.scheduler = None  # Created in on_fit_start()

    @abstractmethod
    def get_scheduler(self, optimizer, args):
        """ Build and return the scheduler to use for ``optimizer``.

        Declared abstract so that a subclass forgetting to implement it fails at instantiation,
        instead of leaving ``self.scheduler`` set to None and failing later on ``step()``.

        :param optimizer: Optimizer the scheduler is attached to
        :param args:      Run arguments forwarded from on_fit_start()
        :return:          The scheduler instance
        """

    def on_fit_start(self, optimizer, args):
        """ Create the scheduler from the subclass-provided factory. """
        self.scheduler = self.get_scheduler(optimizer, args)

    def on_checkpoint_load(self, checkpoint):
        """ Restore the scheduler state stored under ``'scheduler_state_dict'``.

        Requires on_fit_start() to have run first, since the scheduler is None until then.
        """
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    def on_checkpoint_save(self, checkpoint):
        """ Store the scheduler state in ``checkpoint`` under ``'scheduler_state_dict'``. """
        checkpoint['scheduler_state_dict'] = self.scheduler.state_dict()

    def on_epoch_end(self):
        """ Log the current learning rate (when a logger is set), then advance the schedule. """
        if self.logger is not None:
            # Logged before step(), so the value and the step index both describe the epoch that just ended
            current_lr = self.scheduler.get_last_lr()[0]
            self.logger.log_metrics({'learning rate': current_lr}, step=self.scheduler.last_epoch)
        self.scheduler.step()
class QM9MetricCallback(BaseCallback):
    """ Accumulates the mean absolute error during validation and logs it rescaled by ``targets_std`` """

    def __init__(self, logger, targets_std, prefix=''):
        """
        :param logger:      Logger exposing ``log_metrics``
        :param targets_std: Factor the computed MAE is multiplied by before logging
        :param prefix:      Text prepended to the logged metric names
        """
        self.logger = logger
        self.prefix = prefix
        self.targets_std = targets_std
        self.mae = MeanAbsoluteError()
        self.best_mae = float('inf')  # Lowest rescaled MAE seen so far

    def on_validation_step(self, input, target, pred):
        """ Feed one (prediction, target) pair to the metric; both are detached first. """
        prediction, truth = pred.detach(), target.detach()
        self.mae(prediction, truth)

    def on_validation_end(self, epoch=None):
        """ Compute the rescaled MAE, log it, and update the best value seen so far. """
        rescaled_mae = self.mae.compute() * self.targets_std
        metric_name = f'{self.prefix} MAE'
        logging.info(f'{metric_name}: {rescaled_mae}')
        self.logger.log_metrics({metric_name: rescaled_mae}, epoch)
        self.best_mae = min(self.best_mae, rescaled_mae)

    def on_fit_end(self):
        """ Log the best rescaled MAE, unless no validation result was ever recorded. """
        if self.best_mae == float('inf'):
            return
        self.logger.log_metrics({f'{self.prefix} best MAE': self.best_mae})
class QM9LRSchedulerCallback(LRSchedulerCallback):
    """ Learning-rate callback that schedules with cosine annealing and warm restarts """

    def __init__(self, logger, epochs):
        """
        :param logger: Logger forwarded to the parent callback
        :param epochs: Passed to the scheduler as ``T_0``
        """
        super().__init__(logger)
        self.epochs = epochs

    def get_scheduler(self, optimizer, args):
        """ Build a CosineAnnealingWarmRestarts scheduler for ``optimizer``.

        The minimum learning rate is ``args.min_learning_rate`` when it is truthy,
        otherwise one tenth of ``args.learning_rate``.
        """
        lr_floor = args.min_learning_rate
        # NOTE(review): a falsy value (None, but also an explicit 0) selects the fallback - confirm this is intended
        if not lr_floor:
            lr_floor = args.learning_rate / 10.0
        return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=self.epochs, eta_min=lr_floor)
class PerformanceCallback(BaseCallback):
    """ Records a timestamp at the start of every batch after a warm-up period and logs throughput/latency stats

    Timestamps are stored in milliseconds, so latencies are in milliseconds and
    throughput is in samples per millisecond.
    """

    def __init__(self, logger, batch_size: int, warmup_epochs: int = 1, mode: str = 'train'):
        """
        :param logger:        Logger exposing ``log_metrics``
        :param batch_size:    Number of samples per batch, used to compute the throughput
        :param warmup_epochs: Number of initial epochs whose batches are not timed
        :param mode:          Label embedded in the names of the logged metrics
        """
        self.batch_size = batch_size
        self.warmup_epochs = warmup_epochs
        self.epoch = 0
        self.timestamps = []
        self.mode = mode
        self.logger = logger

    def on_batch_start(self):
        """ Record the current time (in ms) once the warm-up epochs are over. """
        if self.epoch >= self.warmup_epochs:
            self.timestamps.append(time.time() * 1000.0)

    def _log_perf(self):
        """ Compute the statistics and send them to both the Python logger and the metrics logger. """
        stats = self.process_performance_stats()
        if not stats:
            # Fewer than two timed batches: there is no interval to measure, so skip instead of crashing
            logging.warning('performance: not enough timed batches to compute statistics')
            return
        for name, value in stats.items():
            logging.info(f'performance {name}: {value}')
        self.logger.log_metrics(stats)

    def on_epoch_end(self):
        """ Advance the epoch counter used to detect the end of the warm-up. """
        self.epoch += 1

    def on_fit_end(self):
        """ Log the statistics and reset the timestamps, if at least one epoch ran past the warm-up. """
        if self.epoch > self.warmup_epochs:
            self._log_perf()
            self.timestamps = []

    def process_performance_stats(self):
        """ Turn the recorded timestamps into throughput and latency statistics.

        :return: Dict with the mean throughput, the mean latency, the total time and the
                 90th/95th/99th latency percentiles, each keyed with ``self.mode``.
                 Empty dict when fewer than two timestamps were recorded.
        """
        timestamps = np.asarray(self.timestamps)
        if timestamps.size < 2:
            # At least two timestamps are required to form one batch-to-batch interval
            return {}

        deltas = np.diff(timestamps)  # Time between consecutive batch starts, in ms
        throughput = (self.batch_size / deltas).mean()
        stats = {
            f"throughput_{self.mode}": throughput,
            f"latency_{self.mode}_mean": deltas.mean(),
            f"total_time_{self.mode}": timestamps[-1] - timestamps[0],
        }
        for level in (90, 95, 99):
            stats[f"latency_{self.mode}_{level}"] = np.percentile(deltas, level)
        return stats