import math

from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from fairseq.modules.multihead_attention import MultiheadAttention
from transformers import PreTrainedModel

from .extra_fns import ACT2FN

@dataclass
class AbRepOutput:
    """
    Dataclass used to store AbRep output.

    last_hidden_state: hidden states from the final EncoderBlock.
    all_hidden_states: hidden states from every EncoderBlock, only populated when output_hidden_states=True.
    attentions: attention weights from every EncoderBlock, only populated when output_attentions=True.
    """

    last_hidden_state: torch.FloatTensor
    all_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

class EncoderBlocks(PreTrainedModel):
    """
    Wrapper around a stack of EncoderBlocks (or a single one).
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.Layers = nn.ModuleList([EncoderBlock(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for a_EncoderBlock in self.Layers:
            # Each block returns its updated hidden states and, optionally, its attention weights.
            hidden_states, attentions = a_EncoderBlock(hidden_states, attention_mask, output_attentions)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            if output_attentions:
                all_self_attentions = all_self_attentions + (attentions,)
        return AbRepOutput(
            last_hidden_state=hidden_states,
            all_hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

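
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the model). This is a hypothetical
# example: it assumes a plain transformers.PretrainedConfig carrying the
# ad-hoc attributes read by the classes in this module, and that ACT2FN
# contains "gelu".
# ---------------------------------------------------------------------------
def _demo_encoder_blocks():
    from transformers import PretrainedConfig

    config = PretrainedConfig(
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=128,
        hidden_act="gelu",                 # assumed to be a key in ACT2FN
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        layer_norm_eps=1e-12,
    )
    model = EncoderBlocks(config).eval()

    batch_size, seq_len = 2, 10
    hidden_states = torch.randn(batch_size, seq_len, config.hidden_size)
    output = model(hidden_states, output_attentions=True, output_hidden_states=True)

    print(output.last_hidden_state.shape)   # (batch_size, seq_len, hidden_size)
    print(len(output.all_hidden_states))    # one entry per EncoderBlock (the input itself is not included)
    print(len(output.attentions))           # one entry per EncoderBlock
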
class EncoderBlock(PreTrainedModel):
    """
    Single EncoderBlock.

    An EncoderBlock consists of a MultiHeadAttention and an IntermediateLayer.
    """

    def __init__(self, config):
        super().__init__(config)
        self.MultiHeadAttention = ThirdMultiHeadAttention(config)
        self.MHADropout = nn.Dropout(config.hidden_dropout_prob)
        self.MHALayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.IntermediateLayer = IntermediateLayer(config)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        MHAoutput, attentions = self.MultiHeadAttention(hidden_states, attention_mask, output_attentions=output_attentions)
        output = self.MHADropout(MHAoutput)
        # Residual connection around the attention sub-layer, followed by layer normalization.
        output = self.MHALayerNorm(output + hidden_states)
        output = self.IntermediateLayer(output)
        return output, attentions

class ThirdMultiHeadAttention(PreTrainedModel):
    """
    New MultiHeadAttention (fairseq-based) which can return the attention weights of the individual heads.
    """

    def __init__(self, config):
        super().__init__(config)
        self.Attention = MultiheadAttention(config.hidden_size, config.num_attention_heads,
                                            dropout=config.attention_probs_dropout_prob, self_attention=True)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # fairseq expects (seq_len, batch, hidden_size), so transpose from batch-first.
        hidden_states = torch.transpose(hidden_states, 0, 1)

        # key_padding_mask has shape (batch, seq_len) with padded positions marked as True/1.
        # need_head_weights=True makes fairseq return the weights of each head separately.
        attn_output, attn_weights = self.Attention(hidden_states, hidden_states, hidden_states,
                                                   key_padding_mask=attention_mask, static_kv=True,
                                                   need_weights=output_attentions, need_head_weights=output_attentions)
        # Transpose back to (batch, seq_len, hidden_size).
        return torch.transpose(attn_output, 0, 1), attn_weights

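
# ---------------------------------------------------------------------------
# Hypothetical shape sketch (not part of the model): compares the attention
# weights returned by ThirdMultiHeadAttention (per head, via fairseq's
# need_head_weights) with those of OldMultiHeadAttention below (averaged over
# heads by torch.nn.MultiheadAttention). The config attributes are the same
# ad-hoc assumptions as in _demo_encoder_blocks above.
# ---------------------------------------------------------------------------
def _demo_attention_weights():
    from transformers import PretrainedConfig

    config = PretrainedConfig(hidden_size=64, num_attention_heads=4, attention_probs_dropout_prob=0.0)
    hidden_states = torch.randn(2, 10, config.hidden_size)  # (batch, seq_len, hidden_size)

    _, per_head = ThirdMultiHeadAttention(config).eval()(hidden_states, output_attentions=True)
    _, averaged = OldMultiHeadAttention(config).eval()(hidden_states, output_attentions=True)

    print(per_head.shape)   # expected: (num_heads, batch, seq_len, seq_len)
    print(averaged.shape)   # expected: (batch, seq_len, seq_len)
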
class OldMultiHeadAttention(PreTrainedModel):
    """
    MultiHeadAttention containing a scaled dot-product attention and a linear output layer
    (torch.nn.MultiheadAttention); returns attention weights averaged over heads.
    """

    def __init__(self, config):
        super().__init__(config)
        self.Attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, config.attention_probs_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # torch.nn.MultiheadAttention expects (seq_len, batch, hidden_size), so transpose from batch-first.
        hidden_states = torch.transpose(hidden_states, 0, 1)
        output, attentions = self.Attention(hidden_states, hidden_states, hidden_states,
                                            key_padding_mask=attention_mask, need_weights=output_attentions)
        # Transpose back to (batch, seq_len, hidden_size).
        attention_output = torch.transpose(output, 0, 1)
        return attention_output, attentions

class IntermediateLayer(PreTrainedModel):
    """
    Position-wise feed-forward layer which expands the hidden size to the intermediate size and
    projects it back, functioning as a residual block that ends with dropout and layer normalization.
    """

    def __init__(self, config):
        super().__init__(config)
        self.expand_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = ACT2FN[config.hidden_act]

        self.dense_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        output = self.expand_dense(hidden_states)
        output = self.intermediate_act_fn(output)
        output = self.dense_dense(output)
        output = self.dropout(output)
        # Residual connection around the feed-forward sub-layer, followed by layer normalization.
        output = self.LayerNorm(output + hidden_states)
        return output