---
language:
- en
---
|
|
|
This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). Details on how to use it are available in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository.
|
|
|
```sh
pip install sparsembed
```
|
|
|
```python
|
import torch
from sparsembed import model, retrieve
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
|
# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 10

# Hugging Face Hub checkpoint used for both the weights and the tokenizer.
model_name = "raphaelsty/sparsembed-max"

# Documents to index. "id" is the retriever key; "title" and "text" are the
# fields that get searched (see the retriever configuration below).
documents = [
    {
        "id": 0,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "Paris is the capital and most populous city of France.",
    },
    {
        "id": 1,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "Since the 17th century, Paris has been one of Europe's major centres of science, and arts.",
    },
    {
        "id": 2,
        "title": "Paris",
        "url": "https://en.wikipedia.org/wiki/Paris",
        "text": "The City of Paris is the centre and seat of government of the region and province of Île-de-France.",
    },
]

# Bind the instance to its own name so the imported `model` module is not
# shadowed and stays usable after this statement.
sparse_model = model.SparsEmbed(
    model=AutoModelForMaskedLM.from_pretrained(model_name).to(device),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    device=device,
)

# NOTE(review): a SparsEmbed model is handed to SpladeRetriever here; the
# library presumably also ships a SparsEmbed-specific retriever - confirm
# which one is intended for this checkpoint.
retriever = retrieve.SpladeRetriever(
    key="id",  # Key identifier of each document.
    on=["title", "text"],  # Fields to search.
    model=sparse_model,  # Model used to encode documents and queries.
)

# Index the documents; `add` returns the retriever, so rebind it.
retriever = retriever.add(
    documents=documents,
    batch_size=batch_size,
    k_tokens=256,  # Number of activated tokens.
)

# Keep the result and print it so the example shows output when run as a script.
matches = retriever(
    ["paris", "Toulouse"],  # Queries.
    k_tokens=20,  # Maximum number of activated tokens.
    k=100,  # Number of documents to retrieve.
    batch_size=batch_size,
)
print(matches)
|
```