How to extract a general text embedding
Is it possible for you to share an example of extracting a general text embedding? I would love to test a clustering task based on semantic similarity.
Hi @daisyyayueyue
Hope the code below helps you.
import torch.nn.functional as F

def embedding_generation(query):
    # Assumes tokenizer, model, and average_pool are already defined; query can
    # be a single string or a list of strings.
    inputs = tokenizer(query, max_length=4096, padding=True, truncation=True, return_tensors='pt')
    # Pool the token embeddings from the model's embedding layer (model.model
    # assumes a *ForCausalLM wrapper), then L2-normalize.
    inputs_embeds = model.model.embed_tokens(inputs.input_ids)
    embeddings = average_pool(inputs_embeds, inputs['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    # scores = (embeddings[:2] @ embeddings[2:].T) * 100
    # print(scores.tolist())
    return embeddings
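If the goal is clustering by semantic similarity, the embeddings returned above can be fed straight into a clustering algorithm. A minimal sketch, assuming scikit-learn is installed and that tokenizer, model, average_pool, and embedding_generation are defined as in this reply (the example texts are made up):

from sklearn.cluster import KMeans

texts = [
    "How do I reset my password?",
    "I forgot my login credentials.",
    "What is the capital of France?",
]

# (n_texts, hidden_size) array of L2-normalized embeddings.
embeddings = embedding_generation(texts).detach().cpu().numpy()

# Since the vectors are L2-normalized, Euclidean k-means is a reasonable
# proxy for clustering by cosine similarity.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(embeddings)
print(kmeans.labels_)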
Where is average_pool defined? I only see last_token_pool.
@pduggi Use the function below:
from torch import Tensor

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # Zero out padding positions, then average over the valid tokens only.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
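As a quick sanity check, the masking behaviour can be verified on a toy batch (the numbers below are made up purely for illustration):

import torch

# Two sequences of hidden size 2; the second one has one padding position.
hidden = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]],
                       [[2.0, 2.0], [4.0, 4.0], [0.0, 0.0]]])
mask = torch.tensor([[1, 1, 1],
                     [1, 1, 0]])

print(average_pool(hidden, mask))
# tensor([[3., 3.],
#         [3., 3.]])  -- padding is excluded from both the sum and the count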
@mrhimanshu Can this only work with a batch size of 1? Is there any way to use a bigger batch size?
@mrhimanshu this worked for me:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    # With left padding, the last position holds the final real token for every
    # sequence; otherwise index each sequence's last non-padding position.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

documents = [
    "As a general guideline...",
    "Definition of summit..."
]

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')
# model.to('cuda')

max_length = 4096

# Tokenize without padding, append the EOS token to each sequence, then pad the batch.
batch_dict = tokenizer(documents, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)
batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
# batch_dict.to('cuda')

outputs = model(**batch_dict)
# Take the last token's hidden state as the sequence embedding and L2-normalize.
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)
embeddings = embeddings.tolist()
Move things to 'cuda' if you need the GPU.
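On the batch-size question above: nothing in this code limits you to a batch size of 1; documents is already a batch of 2. To embed a larger corpus without running out of memory, you can process it in mini-batches. A rough sketch, reusing tokenizer, model, and last_token_pool from the code above (batch_size is an arbitrary knob to tune for your hardware):

import torch
import torch.nn.functional as F

def embed_in_batches(texts, batch_size=8, max_length=4096):
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # Same recipe as above: tokenize, append EOS, pad, run the model.
        batch = tokenizer(chunk, max_length=max_length - 1,
                          return_attention_mask=False, padding=False, truncation=True)
        batch['input_ids'] = [ids + [tokenizer.eos_token_id] for ids in batch['input_ids']]
        batch = tokenizer.pad(batch, padding=True, return_attention_mask=True, return_tensors='pt')
        # batch = batch.to('cuda')  # if the model is on the GPU
        with torch.no_grad():
            out = model(**batch)
        emb = last_token_pool(out.last_hidden_state, batch['attention_mask'])
        all_embeddings.append(F.normalize(emb, p=2, dim=1).cpu())
    return torch.cat(all_embeddings, dim=0)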
Does the above code work for text embeddings for a retrieval task? The data is in chunks of 3,000 characters.
@nirupamadasari05 Yes, it will work.
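For reference, a rough retrieval sketch using these embeddings: it reuses the hypothetical embed_in_batches helper sketched above, doc_chunks stands in for your 3,000-character chunks, and the instruction-prefixed query format follows the usage example on the model card (documents themselves need no prefix):

task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [f'Instruct: {task}\nQuery: how high is mount everest']

doc_chunks = [
    "As a general guideline...",
    "Definition of summit...",
]

query_embs = embed_in_batches(queries)    # (n_queries, hidden_size)
doc_embs = embed_in_batches(doc_chunks)   # (n_chunks, hidden_size)

# The embeddings are L2-normalized, so a dot product equals cosine similarity.
scores = query_embs @ doc_embs.T
ranking = scores.argsort(dim=1, descending=True)
print(scores.tolist())
print(ranking.tolist())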