malenia1
/

ternary-weight-embedding

8-bit precision

Model card · Files and versions · Community

ternary-weight-embedding / vocab.txt

malenia1's picture

Upload vocab.txt with huggingface_hub

0bfc3d6 verified 23 days ago

3.93 kB

	import os

	# Put the CUDA toolchain directory first on PATH; this runs before bitblas is
	# imported below. An unset or empty PATH is handled explicitly: indexing
	# os.environ would raise KeyError, and a trailing ":" would add an empty PATH
	# entry, which POSIX treats as the current directory.
	os.environ["PATH"] = ":".join(
	    entry for entry in ("/usr/local/cuda/bin", os.environ.get("PATH", "")) if entry
	)
	# NOTE(review): PCI_BUS_ID presumably makes CUDA device indices follow PCI bus
	# order - confirm against the CUDA environment-variable documentation.
	os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

	import bitblas
	import torch
	import torch.nn as nn
	from transformers import BertConfig, BertModel, PreTrainedModel, PretrainedConfig,AutoModel,AutoConfig,BertPreTrainedModel

	class bitlinear(bitblas.Linear):
	    """``bitblas.Linear`` with a post-matmul scale and an optional bias.

	    ``forward`` takes the output of ``bitblas.Linear.forward``, multiplies it
	    in place by ``alpha``, adds ``b`` when one was supplied, and returns the
	    result converted to ``float32``. Both ``alpha`` and ``b`` are stored as
	    parameters with ``requires_grad=False``.

	    Args:
	        in_features, out_features, bias, A_dtype, W_dtype, accum_dtype,
	        out_dtype, group_size, with_scaling, with_zeros, zeros_mode,
	        fast_decoding: forwarded unchanged to ``bitblas.Linear.__init__``.
	        opt_M: forwarded to ``bitblas.Linear.__init__``; ``None`` selects
	            ``[1, 16, 32, 64, 128, 256, 512]`` (a fresh list per instance).
	        alpha: scale tensor applied to the matmul output. A ``torch.dtype``
	            (the signature default) or ``None`` is interpreted as a scalar
	            scale of 1 in that dtype (``float16`` for ``None``).
	        b: bias tensor added after scaling, or ``None`` for no bias.
	    """

	    def __init__(
	        self,
	        in_features: int,
	        out_features: int,
	        bias: bool = False,
	        A_dtype: str = "float16",
	        W_dtype: str = "int2",
	        accum_dtype: str = "float16",
	        out_dtype: str = "float16",
	        group_size: int = -1,
	        with_scaling: bool = False,
	        with_zeros: bool = False,
	        zeros_mode: str = None,
	        opt_M: list = None,
	        fast_decoding: bool = True,
	        alpha: "torch.Tensor | torch.dtype" = torch.float16,
	        b: torch.Tensor = None,
	    ):
	        super().__init__(
	            in_features=in_features,
	            out_features=out_features,
	            bias=bias,
	            A_dtype=A_dtype,
	            W_dtype=W_dtype,
	            accum_dtype=accum_dtype,
	            out_dtype=out_dtype,
	            group_size=group_size,
	            with_scaling=with_scaling,
	            with_zeros=with_zeros,
	            zeros_mode=zeros_mode,
	            # Build the default here so instances never share one list object.
	            opt_M=[1, 16, 32, 64, 128, 256, 512] if opt_M is None else opt_M,
	            fast_decoding=fast_decoding,
	        )

	        # nn.Parameter requires a tensor; the signature default is a dtype, so
	        # turn a dtype (or None) into a scalar unit scale instead of raising.
	        if alpha is None:
	            alpha = torch.float16
	        if isinstance(alpha, torch.dtype):
	            alpha = torch.ones((), dtype=alpha)
	        self.alpha = nn.Parameter(alpha, requires_grad=False)

	        # nn.Parameter(None) silently becomes an empty tensor, which would make
	        # the "is not None" check in forward always true and then fail when the
	        # empty bias is added. Register a real None instead.
	        if b is None:
	            self.register_parameter("b", None)
	        else:
	            self.b = nn.Parameter(b, requires_grad=False)

	    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
	        """Run the parent forward, then apply ``alpha`` and ``b``.

	        Args:
	            A: input passed to ``bitblas.Linear.forward``.
	            out: optional output argument passed through to the parent.

	        Returns:
	            The scaled (and, if ``b`` is set, biased) result as ``float32``.
	        """
	        out = super().forward(A, out)
	        out *= self.alpha
	        if self.b is not None:
	            # Flattened bias broadcasts over every leading dimension of out.
	            out += self.b.view(-1)
	        return out.to(torch.float32)


	class TernaryBertConfig(BertConfig):
	    """``BertConfig`` subclass whose ``model_type`` is ``"ternarybert"``."""

	    model_type = "ternarybert"

	    def __init__(self, **kwargs):
	        # Keyword-only pass-through; every option is handled by BertConfig.
	        super().__init__(**kwargs)


	class TernaryBert(PreTrainedModel):
	    """``BertModel`` wrapper whose ``nn.Linear`` submodules become ``bitlinear``.

	    On construction a ``BertModel`` is built from ``config`` and every
	    ``nn.Linear`` found anywhere in its module tree is replaced by a
	    ``bitlinear`` layer of the same shape.
	    """

	    # NOTE: the TernaryBertConfig class defined in this file is not wired in
	    # here; the stock BertConfig is used as the config class.
	    config_class = BertConfig

	    def __init__(self, config):
	        super().__init__(config)
	        self.bert = BertModel(config)
	        self.replace_linear2bitblas(self.bert)

	    def forward(self, **kwargs):
	        """Pass all keyword arguments to the wrapped ``BertModel``.

	        Only keyword arguments are accepted (e.g. ``input_ids=...``).
	        """
	        return self.bert(**kwargs)

	    def convert_to_bitlinear(self, layer):
	        """Build a ``bitlinear`` with the same in/out features as ``layer``.

	        Only the layer's dimensions and bias are carried over; the weight
	        matrix of ``layer`` is not read here.
	        NOTE(review): presumably the quantized weights are loaded afterwards
	        from a checkpoint - confirm against the loading code.

	        Args:
	            layer: the ``nn.Linear`` being replaced.

	        Returns:
	            A new ``bitlinear`` with unit ``alpha`` and the layer's bias in
	            ``float16`` (zeros when the layer has no bias).
	        """
	        if layer.bias is None:
	            # A bias-free nn.Linear has bias=None; a zero bias is numerically
	            # equivalent and avoids dereferencing None.
	            bias_fp16 = torch.zeros(
	                layer.out_features,
	                dtype=torch.float16,
	                device=layer.weight.device,
	            )
	        else:
	            bias_fp16 = layer.bias.data.to(torch.float16)

	        return bitlinear(
	            in_features=layer.in_features,
	            out_features=layer.out_features,
	            bias=False,
	            A_dtype="float16",  # activation A dtype
	            W_dtype="int2",  # weight W dtype
	            accum_dtype="float16",  # accumulation dtype
	            out_dtype="float16",  # output dtype
	            # configs for weight only quantization
	            group_size=-1,  # setting for grouped quantization
	            with_scaling=False,  # setting for scaling factor
	            with_zeros=False,  # setting for zeros
	            zeros_mode=None,  # setting for how to calculate zeros
	            # Target optimization var for dynamic symbolic.
	            # For detailed information please check out docs/PythonAPI.md
	            opt_M=[1, 16, 32, 64, 128, 256, 512],
	            fast_decoding=True,
	            alpha=torch.tensor(1.0, dtype=torch.float16),
	            b=bias_fp16,
	        )

	    def replace_linear2bitblas(self, model):
	        """Recursively replace every ``nn.Linear`` under ``model`` in place.

	        Args:
	            model: module whose direct and nested children are rewritten.
	        """
	        # Snapshot the children first: setattr below rewrites the very mapping
	        # that named_children() iterates over.
	        for name, module in list(model.named_children()):
	            if isinstance(module, nn.Linear):
	                setattr(model, name, self.convert_to_bitlinear(module))
	            else:
	                # Recursing into a module without children is a no-op.
	                self.replace_linear2bitblas(module)