ritaranx committed on
Commit
1b758c5
1 Parent(s): 5423779

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -5
README.md CHANGED
@@ -45,17 +45,19 @@ from transformers import AutoTokenizer, AutoModel
45
 
46
  def last_token_pool(last_hidden_states: Tensor,
47
  attention_mask: Tensor) -> Tensor:
 
48
  left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
49
  if left_padding:
50
- return last_hidden_states[:, -1]
51
  else:
52
  sequence_lengths = attention_mask.sum(dim=1) - 1
53
- batch_size = last_hidden_states.shape[0]
54
- return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
55
 
56
 
57
  def get_detailed_instruct_query(task_description: str, query: str) -> str:
58
- return f'Instruct: {task_description}\nQuery: {query}'
59
 
60
  def get_detailed_instruct_passage(passage: str) -> str:
61
  return f'Represent this passage\npassage: {passage}'
@@ -77,7 +79,11 @@ input_texts = queries + documents
77
  max_length = 512
78
 
79
  # Tokenize the input texts
80
- batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
 
 
 
 
81
 
82
  model.eval()
83
  with torch.no_grad():
 
45
 
46
  def last_token_pool(last_hidden_states: Tensor,
47
  attention_mask: Tensor) -> Tensor:
48
+ last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
49
  left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
50
  if left_padding:
51
+ embedding = last_hidden[:, -1]
52
  else:
53
  sequence_lengths = attention_mask.sum(dim=1) - 1
54
+ batch_size = last_hidden.shape[0]
55
+ embedding = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
56
+ return embedding
57
 
58
 
59
  def get_detailed_instruct_query(task_description: str, query: str) -> str:
60
+ return f'{task_description}\nQuery: {query}'
61
 
62
  def get_detailed_instruct_passage(passage: str) -> str:
63
  return f'Represent this passage\npassage: {passage}'
 
79
  max_length = 512
80
 
81
  # Tokenize the input texts
82
+ batch_dict = tokenizer(input_texts, max_length=max_length-1, padding=False, truncation=True, return_attention_mask=False)
83
+
84
+ # Important! Adding EOS token at the end
85
+ batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
86
+ batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt').to("cuda")
87
 
88
  model.eval()
89
  with torch.no_grad():