lola_v1 / modeling_lola_gpt2.py

Upload model

5b878c2 verified 8 months ago

29.2 kB

	# This script provides an implementation of GPT2 based mixture-of-experts model.
	# Most of its functionality is copied from existing GPT2 implementation on huggingface: https://huggingface.co/docs/transformers/v4.20.1/en/model_doc/gpt2
	# MoE layers are inspired by Mixtral: https://huggingface.co/docs/transformers/v4.39.1/en/model_doc/mixtral
	# There are, however, slight differences in this implementation to adapt it to behave like DeepSpeed Megatron's GPT2 MoE: https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_MoE128.sh
	# Please note: Most of the features from DeepSpeed Megatron's GPT MoE are not implemented here.

	import warnings
	from typing import Optional, Tuple, Union

	from .configuration_lola_gpt2 import LOLAConfig
	import torch
	import torch.utils.checkpoint
	from torch import nn
	import torch.nn.functional as F
	from torch.nn import CrossEntropyLoss

	from transformers.modeling_outputs import (
	BaseModelOutputWithPastAndCrossAttentions,
	SequenceClassifierOutputWithPast,
	QuestionAnsweringModelOutput
	)
	from transformers.modeling_utils import SequenceSummary
	from transformers.pytorch_utils import Conv1D
	from transformers.utils import (
	logging
	)
	from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

	from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2MLP, GPT2Block, GPT2PreTrainedModel
	from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, GPT2ForTokenClassification


	# Module-level logger, obtained through the `transformers.utils.logging` helper imported above.
	logger = logging.get_logger(__name__)

	# LOLA
	class LOLAModel(GPT2PreTrainedModel):
	    """GPT-2 style transformer backbone whose layers alternate between dense and MoE blocks.

	    Even-indexed layers are stock ``GPT2Block`` instances and odd-indexed layers are ``LOLABlock``
	    instances (attention followed by a mixture-of-experts feed-forward). Token and position
	    embeddings feed the block stack, and a final ``LayerNorm`` (``ln_f``) is applied to its output.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        """Build embeddings, the alternating block stack and the final layer norm from ``config``."""
	        super().__init__(config)

	        self.embed_dim = config.hidden_size

	        # Token embeddings and learned absolute position embeddings.
	        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
	        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

	        self.drop = nn.Dropout(config.embd_pdrop)
	        # Dense GPT2Block on even layer indices, MoE LOLABlock on odd layer indices.
	        self.h = nn.ModuleList([
	            GPT2Block(config, layer_idx=i) if i % 2 == 0 else LOLABlock(config, layer_idx=i) for i in range(config.num_hidden_layers)
	        ])
	        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

	        # Model parallel
	        self.model_parallel = False
	        self.device_map = None
	        self.gradient_checkpointing = False

	        # Initialize weights and apply final processing
	        self.post_init()

	    def parallelize(self, device_map=None):
	        """Spread the blocks over CUDA devices (deprecated; emits a ``FutureWarning``).

	        Args:
	            device_map: mapping of device index to the list of block indices placed on it. When
	                ``None``, a map is computed with ``get_device_map`` over all visible CUDA devices.
	        """
	        # Check validity of device_map
	        warnings.warn(
	            "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
	            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
	            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
	            " ...}",
	            FutureWarning,
	        )
	        self.device_map = (
	            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
	        )
	        assert_device_map(self.device_map, len(self.h))
	        self.model_parallel = True
	        # Embeddings live on the first device; the final layer norm on the last one.
	        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
	        self.last_device = "cuda:" + str(max(self.device_map.keys()))
	        self.wte = self.wte.to(self.first_device)
	        self.wpe = self.wpe.to(self.first_device)
	        # Load onto devices
	        for k, v in self.device_map.items():
	            for block in v:
	                cuda_device = "cuda:" + str(k)
	                self.h[block] = self.h[block].to(cuda_device)
	        # ln_f to last
	        self.ln_f = self.ln_f.to(self.last_device)

	    def deparallelize(self):
	        """Undo ``parallelize``: move every sub-module back to CPU and clear the CUDA cache (deprecated)."""
	        warnings.warn(
	            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
	            FutureWarning,
	        )
	        self.model_parallel = False
	        self.device_map = None
	        self.first_device = "cpu"
	        self.last_device = "cpu"
	        self.wte = self.wte.to("cpu")
	        self.wpe = self.wpe.to("cpu")
	        for index in range(len(self.h)):
	            self.h[index] = self.h[index].to("cpu")
	        self.ln_f = self.ln_f.to("cpu")
	        torch.cuda.empty_cache()

	    def get_input_embeddings(self):
	        """Return the token embedding module (``wte``)."""
	        return self.wte

	    def set_input_embeddings(self, new_embeddings):
	        """Replace the token embedding module (``wte``) with ``new_embeddings``."""
	        self.wte = new_embeddings

	    def _prune_heads(self, heads_to_prune):
	        """
	        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
	        """
	        for layer, heads in heads_to_prune.items():
	            self.h[layer].attn.prune_heads(heads)

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
	        """Embed the inputs, run them through every block and apply the final layer norm.

	        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given. Flags left as ``None``
	        (``use_cache``, ``output_attentions``, ``output_hidden_states``, ``return_dict``) fall back
	        to the corresponding values on ``self.config``.

	        Returns:
	            A ``BaseModelOutputWithPastAndCrossAttentions`` when ``return_dict`` is truthy, otherwise
	            a tuple holding, in order, the non-``None`` values among: last hidden state, cached
	            key/values, all hidden states, self-attentions and cross-attentions.

	        Raises:
	            ValueError: if both or neither of ``input_ids`` and ``inputs_embeds`` are supplied, or if
	                an ``attention_mask`` is supplied while the batch size is not positive.
	        """
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        use_cache = use_cache if use_cache is not None else self.config.use_cache
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        if input_ids is not None and inputs_embeds is not None:
	            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
	        elif input_ids is not None:
	            # self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
	            input_shape = input_ids.size()
	            # Collapse any leading dimensions so that input_ids is (rows, sequence_length).
	            input_ids = input_ids.view(-1, input_shape[-1])
	            batch_size = input_ids.shape[0]
	        elif inputs_embeds is not None:
	            input_shape = inputs_embeds.size()[:-1]
	            batch_size = inputs_embeds.shape[0]
	        else:
	            raise ValueError("You have to specify either input_ids or inputs_embeds")

	        device = input_ids.device if input_ids is not None else inputs_embeds.device

	        if token_type_ids is not None:
	            token_type_ids = token_type_ids.view(-1, input_shape[-1])

	        if past_key_values is None:
	            past_length = 0
	            # One empty cache slot per block so the zip() over blocks below still works.
	            past_key_values = tuple([None] * len(self.h))
	        else:
	            # Number of already-cached positions, read from the first layer's first cached tensor.
	            past_length = past_key_values[0][0].size(-2)
	        if position_ids is None:
	            # Default positions continue right after the cached prefix.
	            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
	            position_ids = position_ids.unsqueeze(0)

	        # GPT2Attention mask.
	        if attention_mask is not None:
	            if batch_size <= 0:
	                raise ValueError("batch_size has to be defined and > 0")
	            attention_mask = attention_mask.view(batch_size, -1)
	            # We create a 3D attention mask from a 2D tensor mask.
	            # Sizes are [batch_size, 1, 1, to_seq_length]
	            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
	            # this attention mask is more simple than the triangular masking of causal attention
	            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
	            attention_mask = attention_mask[:, None, None, :]

	            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
	            # masked positions, this operation will create a tensor which is 0.0 for
	            # positions we want to attend and the dtype's smallest value for masked positions.
	            # Since we are adding it to the raw scores before the softmax, this is
	            # effectively the same as removing these entirely.
	            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
	            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

	        # If a 2D or 3D attention mask is provided for the cross-attention
	        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
	        if self.config.add_cross_attention and encoder_hidden_states is not None:
	            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
	            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
	            if encoder_attention_mask is None:
	                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
	            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
	        else:
	            encoder_attention_mask = None

	        # Prepare head mask if needed
	        # 1.0 in head_mask indicate we keep the head
	        # attention_probs has shape bsz x n_heads x N x N
	        # head_mask has shape n_layer x batch x n_heads x N x N
	        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

	        if inputs_embeds is None:
	            inputs_embeds = self.wte(input_ids)
	        position_embeds = self.wpe(position_ids)
	        hidden_states = inputs_embeds + position_embeds

	        if token_type_ids is not None:
	            # Token-type ids are embedded with the same table as the input tokens.
	            token_type_embeds = self.wte(token_type_ids)
	            hidden_states = hidden_states + token_type_embeds

	        hidden_states = self.drop(hidden_states)

	        # Shape used to restore the caller's leading dimensions on the final hidden states.
	        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

	        if self.gradient_checkpointing and self.training:
	            if use_cache:
	                logger.warning_once(
	                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
	                )
	                use_cache = False

	        # Accumulators stay None when the corresponding output was not requested.
	        presents = () if use_cache else None
	        all_self_attentions = () if output_attentions else None
	        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
	        all_hidden_states = () if output_hidden_states else None
	        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
	            # Model parallel
	            if self.model_parallel:
	                torch.cuda.set_device(hidden_states.device)
	                # Ensure layer_past is on same device as hidden_states (might not be correct)
	                if layer_past is not None:
	                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
	                # Ensure that attention_mask is always on the same device as hidden_states
	                if attention_mask is not None:
	                    attention_mask = attention_mask.to(hidden_states.device)
	                if isinstance(head_mask, torch.Tensor):
	                    head_mask = head_mask.to(hidden_states.device)
	            if output_hidden_states:
	                # Record the input of each block; the final output is appended after the loop.
	                all_hidden_states = all_hidden_states + (hidden_states,)

	            if self.gradient_checkpointing and self.training:
	                # Positional arguments follow the block's forward signature; layer_past is None here.
	                outputs = self._gradient_checkpointing_func(
	                    block.__call__,
	                    hidden_states,
	                    None,
	                    attention_mask,
	                    head_mask[i],
	                    encoder_hidden_states,
	                    encoder_attention_mask,
	                    use_cache,
	                    output_attentions,
	                )
	            else:
	                outputs = block(
	                    hidden_states,
	                    layer_past=layer_past,
	                    attention_mask=attention_mask,
	                    head_mask=head_mask[i],
	                    encoder_hidden_states=encoder_hidden_states,
	                    encoder_attention_mask=encoder_attention_mask,
	                    use_cache=use_cache,
	                    output_attentions=output_attentions,
	                )

	            hidden_states = outputs[0]
	            if use_cache is True:
	                presents = presents + (outputs[1],)

	            if output_attentions:
	                # Attention entries shift by one position when no cache entry is returned.
	                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
	                if self.config.add_cross_attention:
	                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)

	            # Model Parallel: If it's the last layer for that device, put things on the next device
	            if self.model_parallel:
	                for k, v in self.device_map.items():
	                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
	                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

	        hidden_states = self.ln_f(hidden_states)

	        hidden_states = hidden_states.view(output_shape)
	        # Add last hidden state
	        if output_hidden_states:
	            all_hidden_states = all_hidden_states + (hidden_states,)

	        if not return_dict:
	            # Tuple output drops every entry that was not requested (i.e. is still None).
	            return tuple(
	                v
	                for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
	                if v is not None
	            )

	        return BaseModelOutputWithPastAndCrossAttentions(
	            last_hidden_state=hidden_states,
	            past_key_values=presents,
	            hidden_states=all_hidden_states,
	            attentions=all_self_attentions,
	            cross_attentions=all_cross_attentions,
	        )

	class LOLABlock(nn.Module):
	    """Transformer block pairing GPT-2 self-attention with a mixture-of-experts feed-forward.

	    Layout: ``ln_1`` -> self-attention -> residual add, then ``ln_2`` -> MoE -> residual add.
	    The block never creates cross-attention sub-modules itself, so passing
	    ``encoder_hidden_states`` raises unless ``crossattention`` has been attached externally.
	    """

	    def __init__(self, config, layer_idx=None):
	        super().__init__()
	        embed_dim = config.hidden_size
	        # Expert inner width: explicit `n_inner` when configured, otherwise four times the hidden size.
	        if config.n_inner is not None:
	            ffn_dim = config.n_inner
	        else:
	            ffn_dim = 4 * embed_dim

	        self.ln_1 = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
	        self.attn = GPT2Attention(config, layer_idx=layer_idx)
	        self.ln_2 = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)

	        # Only the expert count and top-k are forwarded; the DeepSpeed options capacity_factor,
	        # min_capacity, drop_tokens, use_tutel and enable_expert_tensor_parallelism are not
	        # passed to LOLAMOE.
	        self.moe = LOLAMOE(embed_dim, ffn_dim, config, config.num_experts, k=config.topk)

	    def forward(
	        self,
	        hidden_states: Optional[Tuple[torch.FloatTensor]],
	        layer_past: Optional[Tuple[torch.Tensor]] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = False,
	        output_attentions: Optional[bool] = False,
	    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
	        """Run attention and the MoE feed-forward, each wrapped in a residual connection.

	        Returns:
	            ``(hidden_states, present, attentions...)`` when ``use_cache`` is truthy; otherwise the
	            same tuple without the ``present`` entry. The router logits produced by the MoE layer
	            are discarded.

	        Raises:
	            ValueError: if ``encoder_hidden_states`` is given but the block has no
	                ``crossattention`` attribute.
	        """
	        # Self-attention on the normalised input; attn returns (output, present, (attentions)).
	        attn_outputs = self.attn(
	            self.ln_1(hidden_states),
	            layer_past=layer_past,
	            attention_mask=attention_mask,
	            head_mask=head_mask,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	        )
	        extras = attn_outputs[1:]
	        hidden_states = attn_outputs[0] + hidden_states

	        if encoder_hidden_states is not None:
	            if not hasattr(self, "crossattention"):
	                raise ValueError(
	                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
	                    "cross-attention layers by setting `config.add_cross_attention=True`"
	                )
	            cross_attn_outputs = self.crossattention(
	                self.ln_cross_attn(hidden_states),
	                attention_mask=attention_mask,
	                head_mask=head_mask,
	                encoder_hidden_states=encoder_hidden_states,
	                encoder_attention_mask=encoder_attention_mask,
	                output_attentions=output_attentions,
	            )
	            hidden_states = hidden_states + cross_attn_outputs[0]
	            # Cross-attention weights, when produced, sit from index 2 onwards.
	            extras = extras + cross_attn_outputs[2:]

	        # Mixture-of-experts feed-forward on the normalised state.
	        moe_output, _ = self.moe(self.ln_2(hidden_states))
	        hidden_states = hidden_states + moe_output

	        if not use_cache:
	            # Without caching the leading `present` entry is dropped from the extras.
	            extras = extras[1:]

	        return (hidden_states,) + extras  # hidden_states, present, (attentions, cross_attentions)

	class LOLAMOE(nn.Module):
	    """Sparse mixture-of-experts feed-forward layer with a bias-free linear router.

	    Every token is scored by ``gate``; the ``top_k`` highest-probability experts process it and
	    their outputs are scaled by the token's routing weight and summed.

	    Args:
	        hidden_size: width of the incoming hidden states (router input size).
	        inner_dim: inner width handed to each ``GPT2MLP`` expert.
	        config: model configuration forwarded to each ``GPT2MLP`` expert.
	        num_experts: number of experts (and router outputs).
	        k: number of experts selected per token.
	    """

	    def __init__(self,
	                 hidden_size,
	                 inner_dim,
	                 config,
	                 num_experts,
	                 k
	                 ):
	        super().__init__()
	        self.hidden_dim = hidden_size
	        self.num_experts = num_experts
	        self.top_k = k

	        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
	        self.experts = nn.ModuleList([GPT2MLP(inner_dim, config) for _ in range(self.num_experts)])

	    def forward(self, hidden_states):
	        """Route each token to its top-k experts and combine the weighted expert outputs.

	        Args:
	            hidden_states: tensor of shape ``(batch_size, sequence_length, hidden_dim)``.

	        Returns:
	            Tuple ``(final_hidden_states, router_logits)`` where ``final_hidden_states`` has the
	            input's shape and ``router_logits`` has shape
	            ``(batch_size * sequence_length, num_experts)``.
	        """
	        # https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/modeling_mixtral.py#L816
	        # FIXME do it as in top1gating
	        # https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/sharded_moe.py

	        batch_size, sequence_length, hidden_dim = hidden_states.shape
	        # Flatten to one row per token.
	        hidden_states = hidden_states.view(-1, hidden_dim)

	        router_logits = self.gate(hidden_states)

	        # TODO: fix the weights logic to be the same as Megatron
	        # Softmax is evaluated in float32 so routing probabilities keep their precision under
	        # fp16/bf16 activations; the result is cast back to the activation dtype below.
	        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
	        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
	        # Mixtral renormalises here (`routing_weights /= routing_weights.sum(...)`), which would
	        # make the weight exactly 1 for top-1 routing. LOLA keeps the raw probability mass of the
	        # selected experts instead, yielding ONE weight per token of shape (tokens, 1).
	        routing_weights = routing_weights.sum(dim=-1, keepdim=True)
	        routing_weights = routing_weights.to(hidden_states.dtype)

	        final_hidden_states = torch.zeros(
	            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
	        )
	        # expert_mask[e, slot, token] is 1 when `token` picked expert `e` in top-k position `slot`.
	        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
	        for expert_idx in range(self.num_experts):
	            expert_layer = self.experts[expert_idx]
	            _, top_x = torch.where(expert_mask[expert_idx])

	            # No token was routed to this expert.
	            if top_x.shape[0] == 0:
	                continue

	            # The routing weight is per token (its slot axis was summed away above), so it is
	            # indexed by token only. Indexing it with the top-k slot as well goes out of range
	            # for any slot > 0, i.e. whenever top_k > 1. For top_k == 1 the values are unchanged.
	            # NOTE(review): with top_k > 1 every selected expert is scaled by the same summed
	            # probability; matching Megatron's gating is still covered by the TODO above.
	            current_state = hidden_states[top_x]
	            current_hidden_states = expert_layer(current_state) * routing_weights[top_x]

	            # Scatter-add the expert output back into the rows of the tokens it processed.
	            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
	        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
	        return final_hidden_states, router_logits

	class LOLAAttention(GPT2Attention):
	    """GPT-2 attention variant that builds its own buffers and projections.

	    The constructor deliberately bypasses ``GPT2Attention.__init__`` and sets up the causal-mask
	    buffers, projection layers and dropout modules itself; all other behaviour (including
	    ``forward``) is inherited from ``GPT2Attention``.

	    Args:
	        config: model configuration providing the sizes, dropout rates and attention flags read below.
	        is_cross_attention: when true, create separate query (``q_attn``) and key/value
	            (``c_attn``) projections instead of a fused query/key/value projection.
	        layer_idx: index of the layer this module belongs to; stored as ``self.layer_idx``.

	    Raises:
	        ValueError: if ``config.hidden_size`` is not divisible by ``config.num_attention_heads``.
	    """

	    def __init__(self, config, is_cross_attention=False, layer_idx=None):
	        # Start the MRO lookup *after* GPT2Attention so its __init__ is skipped and the next
	        # initialiser in the chain (nn.Module's) runs. The second argument must be `self`:
	        # passing an unrelated class makes super() raise TypeError.
	        super(GPT2Attention, self).__init__()

	        max_positions = config.max_position_embeddings
	        # Lower-triangular boolean mask of shape (1, 1, max_positions, max_positions).
	        # `persistent=False` is intentionally left disabled, so the buffer is registered
	        # with register_buffer's default persistence.
	        self.register_buffer(
	            "bias",
	            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
	                1, 1, max_positions, max_positions
	            ),
	            # persistent=False,
	        )
	        self.register_buffer(
	            "masked_bias",
	            torch.tensor(-1e4),
	            # persistent=False
	        )

	        self.embed_dim = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.embed_dim // self.num_heads
	        self.split_size = self.embed_dim
	        if self.head_dim * self.num_heads != self.embed_dim:
	            raise ValueError(
	                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	                f" {self.num_heads})."
	            )

	        self.scale_attn_weights = config.scale_attn_weights
	        self.is_cross_attention = is_cross_attention

	        # Layer-wise attention scaling, reordering, and upcasting
	        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
	        self.layer_idx = layer_idx
	        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

	        if self.is_cross_attention:
	            # Keys/values and queries are projected by separate layers.
	            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
	            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
	        else:
	            # Fused query/key/value projection.
	            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
	        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

	        self.attn_dropout = nn.Dropout(config.attn_pdrop)
	        self.resid_dropout = nn.Dropout(config.resid_pdrop)

	        self.pruned_heads = set()


	class LOLALMHeadModel(GPT2LMHeadModel):
	    """Language-modelling head on top of ``LOLAModel``.

	    Everything except construction is inherited from ``GPT2LMHeadModel``; only the backbone
	    is swapped for the LOLA transformer.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        # Skip GPT2LMHeadModel.__init__ (it would build its own backbone) and run the next
	        # initialiser in the MRO instead.
	        super(GPT2LMHeadModel, self).__init__(config)

	        self.transformer = LOLAModel(config)
	        # Bias-free projection from the hidden size to the vocabulary.
	        self.lm_head = nn.Linear(
	            in_features=config.n_embd,
	            out_features=config.vocab_size,
	            bias=False,
	        )

	        # Model-parallel state starts disabled.
	        self.model_parallel, self.device_map = False, None

	        # Weight initialisation and final processing.
	        self.post_init()


	class LOLADoubleHeadsModel(GPT2DoubleHeadsModel):
	    """Language-modelling plus multiple-choice heads on top of ``LOLAModel``.

	    Everything except construction is inherited from ``GPT2DoubleHeadsModel``.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        # Skip GPT2DoubleHeadsModel.__init__ and run the next initialiser in the MRO.
	        super(GPT2DoubleHeadsModel, self).__init__(config)

	        # The multiple-choice summary head is built with a single label.
	        config.num_labels = 1

	        self.transformer = LOLAModel(config)
	        self.lm_head = nn.Linear(
	            in_features=config.n_embd,
	            out_features=config.vocab_size,
	            bias=False,
	        )
	        self.multiple_choice_head = SequenceSummary(config)

	        # Model-parallel state starts disabled.
	        self.model_parallel, self.device_map = False, None

	        # Weight initialisation and final processing.
	        self.post_init()


	class LOLAForSequenceClassification(GPT2ForSequenceClassification):
	    """Sequence-classification head on top of ``LOLAModel``.

	    Everything except construction is inherited from ``GPT2ForSequenceClassification``.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        # Skip GPT2ForSequenceClassification.__init__ and run the next initialiser in the MRO.
	        super(GPT2ForSequenceClassification, self).__init__(config)

	        label_count = config.num_labels
	        self.num_labels = label_count

	        self.transformer = LOLAModel(config)
	        # Bias-free scoring layer: hidden size -> number of labels.
	        self.score = nn.Linear(in_features=config.n_embd, out_features=label_count, bias=False)

	        # Model-parallel state starts disabled.
	        self.model_parallel, self.device_map = False, None

	        # Weight initialisation and final processing.
	        self.post_init()

	class LOLAForTokenClassification(GPT2ForTokenClassification):
	    """Token-classification head (dropout + linear classifier) on top of ``LOLAModel``.

	    Everything except construction is inherited from ``GPT2ForTokenClassification``.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        # Skip GPT2ForTokenClassification.__init__ and run the next initialiser in the MRO.
	        super(GPT2ForTokenClassification, self).__init__(config)
	        self.num_labels = config.num_labels

	        self.transformer = LOLAModel(config)

	        # Dropout rate precedence: `classifier_dropout`, then `hidden_dropout`, then 0.1.
	        # A missing attribute and an attribute set to None are treated alike.
	        dropout_rate = getattr(config, "classifier_dropout", None)
	        if dropout_rate is None:
	            dropout_rate = getattr(config, "hidden_dropout", None)
	        if dropout_rate is None:
	            dropout_rate = 0.1
	        self.dropout = nn.Dropout(dropout_rate)
	        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

	        # Model-parallel state starts disabled.
	        self.model_parallel, self.device_map = False, None

	        # Weight initialisation and final processing.
	        self.post_init()

	class LOLAForQuestionAnswering(GPT2PreTrainedModel):
	    """Extractive question-answering head on top of ``LOLAModel``.

	    A linear layer maps every token's hidden state to two logits: span start and span end.
	    """

	    config_class = LOLAConfig

	    def __init__(self, config):
	        super().__init__(config)
	        self.num_labels = config.num_labels
	        self.transformer = LOLAModel(config)
	        # Two outputs per token: start logit and end logit.
	        self.qa_outputs = nn.Linear(config.hidden_size, 2)

	        # Model-parallel state starts disabled.
	        self.model_parallel, self.device_map = False, None

	        # Weight initialisation and final processing.
	        self.post_init()

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        start_positions: Optional[torch.LongTensor] = None,
	        end_positions: Optional[torch.LongTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
	        r"""
	        Compute span start/end logits and, when both label tensors are given, the span loss.

	        start_positions (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Index of the first token of the labelled answer span. Values are clamped to
	            `[0, sequence_length]`; a label equal to `sequence_length` is ignored by the loss.
	        end_positions (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Index of the last token of the labelled answer span, handled like `start_positions`.

	        Returns a `QuestionAnsweringModelOutput` when `return_dict` is truthy; otherwise a tuple
	        `(loss?, start_logits, end_logits, *extra backbone outputs)` where the loss is present
	        only when it was computed.
	        """
	        if return_dict is None:
	            return_dict = self.config.use_return_dict

	        backbone_outputs = self.transformer(
	            input_ids,
	            attention_mask=attention_mask,
	            token_type_ids=token_type_ids,
	            position_ids=position_ids,
	            head_mask=head_mask,
	            inputs_embeds=inputs_embeds,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        # Project the final hidden states to two logits and split them into start / end.
	        span_logits = self.qa_outputs(backbone_outputs[0])
	        start_logits, end_logits = (
	            piece.squeeze(-1).contiguous() for piece in span_logits.split(1, dim=-1)
	        )

	        total_loss = None
	        if not (start_positions is None or end_positions is None):
	            # If we are on multi-GPU, the label tensors may carry an extra trailing dimension.
	            if start_positions.dim() > 1:
	                start_positions = start_positions.squeeze(-1).to(start_logits.device)
	            if end_positions.dim() > 1:
	                end_positions = end_positions.squeeze(-1).to(end_logits.device)

	            # Labels beyond the sequence are clamped onto `sequence_length`, which the loss ignores.
	            ignored_index = start_logits.size(1)
	            criterion = CrossEntropyLoss(ignore_index=ignored_index)
	            start_loss = criterion(start_logits, start_positions.clamp(0, ignored_index))
	            end_loss = criterion(end_logits, end_positions.clamp(0, ignored_index))
	            total_loss = (start_loss + end_loss) / 2

	        if return_dict:
	            return QuestionAnsweringModelOutput(
	                loss=total_loss,
	                start_logits=start_logits,
	                end_logits=end_logits,
	                hidden_states=backbone_outputs.hidden_states,
	                attentions=backbone_outputs.attentions,
	            )

	        tuple_output = (start_logits, end_logits) + backbone_outputs[2:]
	        if total_loss is None:
	            return tuple_output
	        return (total_loss,) + tuple_output