Spaces:

surokpro2
/

Unboxing_SDXL_with_SAEs

Running on Zero

App Files Files Community

Unboxing_SDXL_with_SAEs / scripts /train_sae.py

surokpro2

Upload folder using huggingface_hub

8cd00a9 verified 11 days ago

raw

history blame

9.81 kB

	'''
	Adapted from
	https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/train.py
	'''


	import os
	import sys
	sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
	from typing import Callable, Iterable, Iterator

	import torch
	import torch.distributed as dist
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.distributed import ReduceOp
	from SAE.dataset_iterator import ActivationsDataloader
	from SAE.sae import SparseAutoencoder, unit_norm_decoder_, unit_norm_decoder_grad_adjustment_
	from SAE.sae_utils import SAETrainingConfig, Config

	from types import SimpleNamespace
	from typing import Optional, List
	import json

	import tqdm

	def weighted_average(points: torch.Tensor, weights: torch.Tensor):
	    """Return the weighted mean of the rows of ``points``.

	    :param points: tensor whose rows are averaged; it is multiplied by the
	        weights reshaped with ``view(-1, 1)`` and summed over dim 0.
	    :param weights: one weight per row; normalised here to sum to one.
	    :return: the weighted sum of the rows after normalising the weights.
	    """
	    normalised = weights / weights.sum()
	    column = normalised.view(-1, 1)
	    return torch.sum(points * column, dim=0)


	@torch.no_grad()
	def geometric_median_objective(
	    median: torch.Tensor, points: torch.Tensor, weights: torch.Tensor
	) -> torch.Tensor:
	    """Weighted sum of the distances from every row of ``points`` to ``median``.

	    Runs under ``torch.no_grad()``, so the result carries no autograd history.

	    :param median: candidate point; reshaped with ``view(1, -1)`` so it
	        broadcasts against the rows of ``points``.
	    :param points: tensor whose rows are compared with ``median``.
	    :param weights: one weight per row of ``points``.
	    :return: scalar tensor ``sum_i weights[i] * ||points[i] - median||``.
	    """
	    offsets = points - median.view(1, -1)
	    distances = torch.linalg.norm(offsets, dim=1)  # type: ignore
	    return torch.sum(distances * weights)


	def compute_geometric_median(
	    points: torch.Tensor,
	    weights: Optional[torch.Tensor] = None,
	    eps: float = 1e-6,
	    maxiter: int = 100,
	    ftol: float = 1e-20,
	    do_log: bool = False,
	):
	    """
	    Estimate the (weighted) geometric median of ``points`` with Weiszfeld iterations.

	    :param points: ``torch.Tensor`` of shape ``(n, d)``
	    :param weights: Optional ``torch.Tensor`` of shape :math:``(n,)``. Defaults to all ones.
	    :param eps: Smallest allowed value of denominator, to avoid divide by zero.
	        Equivalently, this is a smoothing parameter. Default 1e-6.
	    :param maxiter: Maximum number of Weiszfeld iterations. Default 100
	    :param ftol: If objective value does not improve by at least this `ftol` fraction, terminate the algorithm. Default 1e-20.
	    :param do_log: If true will return a log of function values encountered through the course of the algorithm
	    :return: SimpleNamespace object with fields
	        - `median`: estimate of the geometric median, which is a ``torch.Tensor`` object of shape :math:``(d,)``
	        - `new_weights`: the re-weighting used to form the returned median.
	        - `termination`: string explaining how the algorithm terminated.
	        - `logs`: function values encountered through the course of the algorithm in a list (None if do_log is false).
	    """
	    with torch.no_grad():
	        if weights is None:
	            weights = torch.ones((points.shape[0],), device=points.device)

	        # Start from the weighted mean of the points.
	        new_weights = weights
	        median = weighted_average(points, weights)
	        objective_value = geometric_median_objective(median, points, weights)
	        logs = [objective_value] if do_log else None

	        # Weiszfeld iterations: re-weight each point by the inverse of its
	        # (clamped) distance to the current estimate, then re-average.
	        converged = False
	        progress = tqdm.tqdm(range(maxiter))
	        for _ in progress:
	            previous_objective = objective_value

	            distances = torch.linalg.norm(points - median.view(1, -1), dim=1)  # type: ignore
	            new_weights = weights / torch.clamp(distances, min=eps)
	            median = weighted_average(points, new_weights)
	            objective_value = geometric_median_objective(median, points, weights)

	            if logs is not None:
	                logs.append(objective_value)
	            if abs(previous_objective - objective_value) <= ftol * objective_value:
	                converged = True
	                break

	            progress.set_description(f"Objective value: {objective_value:.4f}")

	    # Recomputed outside the no_grad block to allow autodiff to track it.
	    median = weighted_average(points, new_weights)

	    if converged:
	        termination = "function value converged within tolerance"
	    else:
	        termination = "maximum iterations reached"

	    return SimpleNamespace(
	        median=median,
	        new_weights=new_weights,
	        termination=termination,
	        logs=logs,
	    )

	def maybe_transpose(x):
	    """Return ``x.T`` when ``x`` is non-contiguous but ``x.T`` is contiguous; otherwise return ``x``."""
	    if x.is_contiguous():
	        return x
	    transposed = x.T
	    if transposed.is_contiguous():
	        return transposed
	    return x

	import wandb

	# Rank of this process; Logger only records and uploads values when RANK == 0.
	RANK = 0

	class Logger:
	    """Buffers metrics under a per-SAE key prefix and flushes them to wandb.

	    The logger is active only when the module-level ``RANK`` is 0 and the
	    ``dummy`` keyword is not truthy; otherwise every call is a no-op
	    (``logkv`` still returns its value).
	    """

	    def __init__(self, sae_name, **kws):
	        # "dummy" is the only keyword consulted; any others are ignored.
	        dummy = kws.pop("dummy", False)
	        self.vals = {}
	        self.enabled = (RANK == 0) and not dummy
	        self.sae_name = sae_name

	    def logkv(self, k, v):
	        """Record ``v`` under ``"<sae_name>/<k>"`` and return ``v`` unchanged.

	        Tensors are stored detached; the returned value is the original
	        object, so a loss can be logged and used in the same expression.
	        """
	        if not self.enabled:
	            return v
	        stored = v.detach() if isinstance(v, torch.Tensor) else v
	        self.vals[f'{self.sae_name}/{k}'] = stored
	        return v

	    def dumpkvs(self, step):
	        """Send the buffered values to ``wandb.log`` at ``step`` and clear the buffer."""
	        if not self.enabled:
	            return
	        wandb.log(self.vals, step=step)
	        self.vals = {}


	class FeaturesStats:
	    """Counts how often each feature index is selected between resets.

	    :param dim: number of features; length of the counter tensor.
	    :param logger: object with a ``logkv(key, value)`` method that receives
	        the statistics.
	    :param device: device on which the counters live. Defaults to ``"cuda"``
	        (the previously hard-coded value), so existing callers are unchanged.
	    """

	    def __init__(self, dim, logger, device="cuda"):
	        self.dim = dim
	        self.logger = logger
	        self.device = device
	        self.reinit()

	    def reinit(self):
	        """Reset the per-feature counters and the sample count to zero."""
	        self.n_activated = torch.zeros(self.dim, dtype=torch.long, device=self.device)
	        self.n = 0

	    def update(self, inds):
	        """Add one count for every index occurring in ``inds``.

	        ``inds.shape[0]`` is added to the sample count.
	        NOTE(review): assumes ``inds`` is an int64 index tensor on the same
	        device as the counters, with the batch on dim 0 -- confirm with caller.
	        """
	        self.n += inds.shape[0]
	        inds = inds.flatten().detach()
	        self.n_activated.scatter_add_(0, inds, torch.ones_like(inds))

	    def log(self):
	        """Log ``log10(count / n + 1e-9)`` per feature under the key ``'activated'``.

	        Nothing is logged when no samples have been seen since the last
	        reset, because the ratio would be 0/0.
	        """
	        if self.n == 0:
	            return
	        # The 1e-9 offset keeps log10 finite for features that never fired.
	        self.logger.logkv('activated', (self.n_activated / self.n + 1e-9).log10().cpu().numpy())

	def training_loop_(
	    aes,
	    train_acts_iter,
	    loss_fn,
	    log_interval,
	    save_interval,
	    loggers,
	    sae_cfgs,
	):
	    """Train several autoencoders side by side on one stream of activation batches.

	    Every batch from ``train_acts_iter`` is moved to CUDA once and then fed to
	    each autoencoder in turn; each autoencoder has its own Adam optimizer,
	    progress bar, feature statistics and logger.

	    :param aes: autoencoders to train; paired positionally with ``sae_cfgs``
	        and ``loggers``.
	    :param train_acts_iter: iterable yielding activation batches.
	    :param loss_fn: called as ``loss_fn(ae, cfg, batch, recons, info, logger)``
	        and must return the loss to backpropagate.
	    :param log_interval: feature statistics are logged and reset every this
	        many batches.
	    :param save_interval: a checkpoint is written to
	        ``"<cfg.save_path>/<batches seen>"`` every this many batches.
	    :param loggers: one logger per autoencoder.
	    :param sae_cfgs: one config per autoencoder (``lr``, ``eps`` and
	        ``save_path`` are read here).
	    """
	    # One (ae, cfg, logger, progress bar, feature stats, optimizer) tuple per SAE.
	    sae_packs = [
	        (
	            ae,
	            cfg,
	            logger,
	            tqdm.tqdm(unit=" steps", desc="Training Loss: "),
	            FeaturesStats(ae.n_dirs, logger),
	            torch.optim.Adam(ae.parameters(), lr=cfg.lr, eps=cfg.eps, fused=True),
	        )
	        for ae, cfg, logger in zip(aes, sae_cfgs, loggers)
	    ]

	    for step, batch in enumerate(train_acts_iter):
	        batch = batch.cuda()
	        batch_size = batch.shape[0]

	        for ae, cfg, logger, pbar, fstats, opt in sae_packs:
	            recons, info = ae(batch)
	            loss = loss_fn(ae, cfg, batch, recons, info, logger)

	            fstats.update(info['inds'])

	            # Fraction of entries in ae.stats_last_nonzero above each
	            # threshold, with the threshold divided by the batch size.
	            for label, threshold in (
	                ('not-activated 1e4', 1e4),
	                ('not-activated 1e6', 1e6),
	                ('not-activated 1e7', 1e7),
	            ):
	                stale = ae.stats_last_nonzero > threshold / batch_size
	                logger.logkv(label, stale.mean(dtype=float).item())

	            logger.logkv('explained variance', explained_variance(recons, batch))
	            norm_ratio = torch.linalg.norm(recons, dim=1) / torch.linalg.norm(batch, dim=1)
	            logger.logkv('l2_div', norm_ratio.mean())

	            if (step + 1) % log_interval == 0:
	                fstats.log()
	                fstats.reinit()

	            # The checkpoint is written before this batch's optimizer step.
	            if (step + 1) % save_interval == 0:
	                ae.save_to_disk(f"{cfg.save_path}/{step + 1}")

	            loss.backward()

	            # Decoder normalisation and gradient adjustment run after
	            # backward and before the optimizer step.
	            unit_norm_decoder_(ae)
	            unit_norm_decoder_grad_adjustment_(ae)

	            opt.step()
	            opt.zero_grad()
	            logger.dumpkvs(step)

	            pbar.set_description(f"Training Loss {loss.item():.4f}")
	            pbar.update(1)

	    # Close every progress bar and write the final checkpoints.
	    for ae, cfg, _logger, pbar, _fstats, _opt in sae_packs:
	        pbar.close()
	        ae.save_to_disk(f"{cfg.save_path}/final")


	def init_from_data_(ae, stats_acts_sample):
	    """Initialise ``ae.pre_bias`` in place with the geometric median of a data sample.

	    At most the first 32768 rows of ``stats_acts_sample`` are used; the median
	    is computed on CPU in float32.

	    :param ae: autoencoder whose ``pre_bias`` data is overwritten.
	    :param stats_acts_sample: tensor of sample activations, one per row.
	    """
	    sample = stats_acts_sample[:32768].float().cpu()
	    median = compute_geometric_median(sample).median
	    # Put the result on the device that already holds pre_bias instead of
	    # the current CUDA device, so the parameter stays with the rest of the
	    # model (CPU models, or models on a non-default GPU).
	    ae.pre_bias.data = median.to(ae.pre_bias.device).float()


	def mse(recons, x):
	    """Mean squared error between ``recons`` and ``x``.

	    The squared differences are averaged over every element, not summed
	    over the last dimension first.
	    """
	    residual = recons - x
	    return residual.pow(2).mean()

	def normalized_mse(recon: torch.Tensor, xs: torch.Tensor) -> torch.Tensor:
	    """MSE of ``recon`` against ``xs`` divided by the MSE of a mean-only baseline.

	    The baseline predicts the mean of ``xs`` over dim 0 for every row.

	    :param recon: reconstruction to score.
	    :param xs: target tensor; indexed as two-dimensional when the mean is
	        broadcast back to ``xs.shape``.
	    :return: ``mse(recon, xs) / mse(mean_baseline, xs)``.
	    """
	    # only used for auxk
	    baseline = xs.mean(dim=0)[None, :].broadcast_to(xs.shape)
	    return mse(recon, xs) / mse(baseline, xs)

	def explained_variance(recons, x):
	    """Mean over columns of ``1 - Var(x - recons) / (Var(x) + 1e-8)``.

	    Variances are taken over dim 0 with ``unbiased=False``.

	    :param recons: reconstruction of ``x``.
	    :param x: original tensor.
	    :return: scalar tensor with the average explained-variance fraction.
	    """
	    residual_var = torch.var(x - recons, dim=0, unbiased=False)
	    total_var = torch.var(x, dim=0, unbiased=False)

	    # The 1e-8 offset guards against division by zero for constant columns.
	    unexplained = residual_var / (total_var + 1e-8)
	    return (1 - unexplained).mean()


	def main():
	    """Train the sparse autoencoders described in ``SAE/config.json``.

	    Steps: load the config, draw ten batches of activations as a statistics
	    sample, build one CUDA autoencoder per entry in ``cfg.saes``, initialise
	    each ``pre_bias`` from the sample, derive the reconstruction-loss scale,
	    start a wandb run, and hand everything to ``training_loop_``. Training
	    continues on the same iterator the statistics sample was drawn from.
	    """
	    # The context manager closes the config file once it has been parsed.
	    with open('SAE/config.json') as config_file:
	        cfg = Config(json.load(config_file))

	    dataloader = ActivationsDataloader(cfg.paths_to_latents, cfg.block_name, cfg.bs)

	    acts_iter = dataloader.iterate()
	    stats_acts_sample = torch.cat([
	        next(acts_iter).cpu() for _ in range(10)
	    ], dim=0)

	    aes = [
	        SparseAutoencoder(
	            n_dirs_local=sae.n_dirs,
	            d_model=sae.d_model,
	            k=sae.k,
	            auxk=sae.auxk,
	            dead_steps_threshold=sae.dead_toks_threshold // cfg.bs,
	        ).cuda()
	        for sae in cfg.saes
	    ]

	    for ae in aes:
	        init_from_data_(ae, stats_acts_sample)

	    # Reciprocal of the mean squared deviation of the sample from its
	    # per-column mean; used to scale the reconstruction MSE.
	    sample_f32 = stats_acts_sample.float()
	    mse_scale = (
	        1 / ((sample_f32.mean(dim=0) - sample_f32) ** 2).mean()
	    )
	    mse_scale = mse_scale.item()
	    del sample_f32
	    del stats_acts_sample

	    wandb.init(
	        project=cfg.wandb_project,
	        name=cfg.wandb_name,
	    )

	    loggers = [
	        Logger(
	            sae_name=cfg_sae.sae_name,
	            dummy=False,
	        )
	        for cfg_sae in cfg.saes
	    ]

	    def loss_fn(ae, cfg_sae, flat_acts_train_batch, recons, info, logger):
	        """Scaled reconstruction MSE plus the weighted AuxK loss; both terms are logged."""
	        # MSE
	        recons_loss = logger.logkv(
	            "train_recons", mse_scale * mse(recons, flat_acts_train_batch)
	        )
	        # AuxK: the sparse decode of the auxiliary latents is scored against
	        # the detached residual of the main reconstruction plus the detached
	        # pre_bias. NaNs are replaced by 0.
	        auxk_loss = logger.logkv(
	            "train_maxk_recons",
	            cfg_sae.auxk_coef
	            * normalized_mse(
	                ae.decode_sparse(
	                    info["auxk_inds"],
	                    info["auxk_vals"],
	                ),
	                flat_acts_train_batch - recons.detach() + ae.pre_bias.detach(),
	            ).nan_to_num(0),
	        )
	        return recons_loss + auxk_loss

	    training_loop_(
	        aes,
	        acts_iter,
	        loss_fn,
	        sae_cfgs=cfg.saes,
	        loggers=loggers,
	        log_interval=cfg.log_interval,
	        save_interval=cfg.save_interval,
	    )


	# Run training only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()