Update README.md
Browse files
README.md
CHANGED
@@ -45,17 +45,19 @@ from transformers import AutoTokenizer, AutoModel
|
|
45 |
|
46 |
def last_token_pool(last_hidden_states: Tensor,
|
47 |
attention_mask: Tensor) -> Tensor:
|
|
|
48 |
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
|
49 |
if left_padding:
|
50 |
-
|
51 |
else:
|
52 |
sequence_lengths = attention_mask.sum(dim=1) - 1
|
53 |
-
batch_size =
|
54 |
-
|
|
|
55 |
|
56 |
|
57 |
def get_detailed_instruct_query(task_description: str, query: str) -> str:
|
58 |
-
return f'
|
59 |
|
60 |
def get_detailed_instruct_passage(passage: str) -> str:
    """Prefix *passage* with the fixed passage instruction.

    Returns the instruction line ``Represent this passage`` followed, on a
    new line, by ``passage: `` and the passage text.
    """
    prompt = f'Represent this passage\npassage: {passage}'
    return prompt
|
@@ -77,7 +79,11 @@ input_texts = queries + documents
|
|
77 |
max_length = 512

# Tokenize the input texts: truncate to max_length, pad the batch to a common
# length and return PyTorch tensors.
batch_dict = tokenizer(
    input_texts,
    max_length=max_length,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
|
|
|
|
|
|
|
|
|
81 |
|
82 |
model.eval()
|
83 |
with torch.no_grad():
|
|
|
45 |
|
46 |
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence down to the hidden state of its last attended token.

    Assumes ``last_hidden_states`` is (batch, seq, dim) and ``attention_mask``
    is (batch, seq) with non-zero entries marking real tokens -- TODO confirm
    against the caller.

    Positions where the mask is zero are zero-filled first. If the final
    column of the mask is set for every row, the final position is returned
    for all rows; otherwise each row is read at index ``mask.sum(dim=1) - 1``.
    """
    padding_positions = ~attention_mask.unsqueeze(-1).bool()
    hidden = last_hidden_states.masked_fill(padding_positions, 0.0)

    num_rows = attention_mask.shape[0]
    every_row_ends_with_token = attention_mask[:, -1].sum() == num_rows
    if every_row_ends_with_token:
        # No row has padding in the last column, so the last position is the
        # last token for the whole batch.
        return hidden[:, -1]

    # Otherwise pick, per row, the position just before the padding starts.
    last_token_index = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(hidden.shape[0], device=hidden.device)
    return hidden[row_index, last_token_index]
|
57 |
|
58 |
|
59 |
def get_detailed_instruct_query(task_description: str, query: str) -> str:
    """Build the query prompt from a task description and the query text.

    Returns the task description followed, on a new line, by ``Query: `` and
    the query itself.
    """
    prompt = f'{task_description}\nQuery: {query}'
    return prompt
|
61 |
|
62 |
def get_detailed_instruct_passage(passage: str) -> str:
    """Prefix *passage* with the fixed passage instruction.

    Returns the instruction line ``Represent this passage`` followed, on a
    new line, by ``passage: `` and the passage text.
    """
    prompt = f'Represent this passage\npassage: {passage}'
    return prompt
|
|
|
79 |
max_length = 512

# Tokenize the input texts.
# Keep the output as plain, unpadded Python lists (no padding, no tensors, no
# attention mask) so an EOS id can be appended to each sequence below. One
# slot is reserved for that EOS token, hence max_length - 1.
batch_dict = tokenizer(
    input_texts,
    max_length=max_length - 1,
    return_attention_mask=False,
    padding=False,
    truncation=True,
)

# Important! Adding EOS token at the end
# This must happen before padding; otherwise the EOS would follow the pad
# tokens and the attention mask would not cover it.
batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]

# Pad to the longest sequence, build the attention mask over the final
# (EOS-terminated) sequences, and convert to tensors on the GPU.
batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt').to("cuda")
|
87 |
|
88 |
model.eval()
|
89 |
with torch.no_grad():
|