SamuelYang committed
Commit e9c77ac
Parent(s): 7e5390b
Upload 8 files
- README.md +94 -0
- config.json +27 -0
- modeling_sparse.py +35 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenization_sparse.py +112 -0
- tokenizer_config.json +10 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,94 @@
---
language:
- zh
base_model: junnyu/roformer_chinese_base
tags:
- transformers
---

## INF Word-level Sparse Embedding (INF-WSE)

**INF-WSE** is a series of word-level sparse embedding models developed by [INFLY TECH](https://www.infly.cn/en). These models generate sparse, high-dimensional text embeddings that capture the most relevant information for search and retrieval, particularly in Chinese text.

### Key Features

- **Optimized for retrieval**: INF-WSE is designed with retrieval tasks in mind. Its sparse embeddings enable efficient matching between queries and documents, making it highly effective for semantic search, ranking, and other information-retrieval scenarios where speed and accuracy are critical.
- **Word-level sparse embeddings**: The model generates sparse representations at the word level, capturing the semantic details that improve the relevance of search results. This is particularly useful for Chinese retrieval tasks, where word segmentation can significantly affect performance.
- **Sparse representation for efficiency**: Unlike dense embeddings with a fixed number of dimensions, INF-WSE produces sparse embeddings whose dimensionality equals the vocabulary size. Most dimensions are zero, so only the most significant terms carry weight. This sparsity reduces the computational load, enabling faster retrieval without compromising precision.

## Usage

### Transformers

#### Infer Embeddings
```python
import torch
from transformers import AutoTokenizer, AutoModel

queries = ['电脑一体机由什么构成?', '什么是掌上电脑?']
documents = [
    '电脑一体机,是由一台显示器、一个电脑键盘和一个鼠标组成的电脑。',
    '掌上电脑是一种运行在嵌入式操作系统和内嵌式应用软件之上的、小巧、轻便、易带、实用、价廉的手持式计算设备。',
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True, use_fast=False)  # the fast tokenizer is not supported yet
model = AutoModel.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True)
model.eval()

max_length = 512

input_batch = tokenizer(input_texts, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
with torch.no_grad():
    embeddings = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=False)  # return_sparse=True returns a sparse tensor instead of a dense one

scores = embeddings[:2] @ embeddings[2:].T
print(scores.tolist())
# [[21.224790573120117, 4.520412921905518], [10.290857315063477, 19.359437942504883]]
```
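The vocabulary-sized embeddings can also be requested as `torch` sparse tensors. A minimal sketch, reusing `queries`, `documents`, `tokenizer`, `model`, and `max_length` from the example above (the names `q_batch` and `d_batch` are ours); the scores should be comparable to the dense ones:

```python
q_batch = tokenizer(queries, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
d_batch = tokenizer(documents, padding=True, max_length=max_length, truncation=True, return_tensors="pt")

with torch.no_grad():
    q_emb = model(q_batch['input_ids'], q_batch['attention_mask'], return_sparse=True)   # sparse COO, [2, vocab_size]
    d_emb = model(d_batch['input_ids'], d_batch['attention_mask'], return_sparse=False)  # dense, [2, vocab_size]

# torch.sparse.mm multiplies a sparse matrix by a dense one and returns a dense result.
scores = torch.sparse.mm(q_emb, d_emb.T)
print(scores.tolist())
```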
#### Convert embeddings to lexical weights
```python
from collections import OrderedDict


def convert_embeddings_to_weights(embeddings, tokenizer):
    values, indices = torch.sort(embeddings, dim=-1, descending=True)

    token2weight = []
    for i in range(embeddings.size(0)):
        token2weight.append(OrderedDict())

        non_zero_mask = values[i] != 0
        tokens = tokenizer.convert_ids_to_tokens(indices[i][non_zero_mask])
        weights = values[i][non_zero_mask].tolist()

        for token, weight in zip(tokens, weights):
            token2weight[i][token] = weight

    return token2weight


token2weight = convert_embeddings_to_weights(embeddings, tokenizer)
print(token2weight[0])
# OrderedDict([('一体机', 3.3438382148742676), ('由', 2.493837356567383), ('电脑', 2.0291812419891357), ('构成', 1.986171841621399), ('什么', 1.0218793153762817)])
```
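Because each embedding is zero everywhere except at its own tokens, the dot-product score between a query and a document is simply the sum of weight products over the tokens they share. A small sketch using the `token2weight` list built above (index 0 is the first query, index 2 the first document; `lexical_score` is our helper name):

```python
def lexical_score(query_weights, doc_weights):
    # Sum of weight products over tokens present in both sparse representations.
    return sum(weight * doc_weights[token]
               for token, weight in query_weights.items()
               if token in doc_weights)

print(lexical_score(token2weight[0], token2weight[2]))  # should closely match scores[0][0] above
```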
## Evaluation

### C-MTEB Retrieval task
([Chinese Massive Text Embedding Benchmark](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB))

Metric: nDCG@10

| Model Name | Max Length | Average | Cmedqa | Covid | Du | Ecom | Medical | MMarco | T2 | Video |
|:---------------------------------------------------:|:----------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
| [BM25-zh](https://github.com/castorini/pyserini) | - | 25.39 | 13.70 | **86.66** | 13.68 | 11.49 | 15.48 | 6.56 | 29.53 | 25.98 |
| [bge-m3-sparse](https://huggingface.co/BAAI/bge-m3) | 512 | 29.94 | **24.50** | 76.16 | 22.12 | 17.62 | 27.52 | 9.78 | **37.69** | 24.12 |
| **inf-wse-v1-base-zh** | 512 | **32.83** | 20.51 | 76.40 | **36.77** | **19.97** | **28.61** | **13.32** | 36.81 | **30.25** |
config.json
ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RoFormerModel"
  ],
  "auto_map": {
    "AutoModel": "modeling_sparse.RoFormerForSparseEmbedding"
  },
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1536,
  "model_type": "roformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "rotary_value": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}
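The `auto_map` entry is what routes `AutoModel` to the custom `RoFormerForSparseEmbedding` class shipped in this repository, which is why the README passes `trust_remote_code=True`. A quick check, assuming the published model id from the README:

```python
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True)
print(config.vocab_size, config.max_position_embeddings)  # 50000 1536, per config.json

model = AutoModel.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True)
print(type(model).__name__)  # RoFormerForSparseEmbedding, resolved via auto_map
```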
modeling_sparse.py
ADDED
@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
from transformers import RoFormerModel, RoFormerPreTrainedModel


class RoFormerForSparseEmbedding(RoFormerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.encoder = RoFormerModel(config)
        self.linear_layer = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask, return_sparse=False):
        B, L = input_ids.shape

        last_hidden_states = self.encoder(input_ids, attention_mask)['last_hidden_state']  # [B,L,D]
        token_weights = self.linear_layer(last_hidden_states).squeeze(-1)  # [B,L]
        token_mask = (1 - attention_mask) * -1e4  # [B,L]
        token_mask[:, 0] = -1e4
        last_ind = torch.sum(attention_mask, -1, keepdim=True) - 1  # [B,1]
        token_mask = torch.scatter(token_mask, -1, last_ind, -1e4)  # [B,L]
        token_weights = token_weights + token_mask  # [B,L]

        emb = torch.zeros(B, L, self.encoder.config.vocab_size, dtype=token_weights.dtype,
                          device=token_weights.device)  # [B,L,V]
        emb = torch.scatter(emb, dim=-1, index=input_ids.unsqueeze(-1), src=token_weights.unsqueeze(-1))  # [B,L,V]
        emb = torch.max(torch.relu(emb), dim=-2).values  # [B,V]

        if return_sparse:
            emb = emb.to_sparse()

        return emb
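`forward` scores every token with a linear head on top of the RoFormer encoder, pushes the weights of [CLS], the final [SEP], and padding positions far below zero, scatters each weight into the slot of its vocabulary id, and max-pools over the sequence so a token that occurs several times keeps its highest weight; the ReLU then zeroes everything that was masked. A toy sketch of just the scatter-and-pool step, with a hypothetical 6-entry vocabulary:

```python
import torch

B, L, V = 1, 3, 6                                # batch size, sequence length, toy vocabulary size
input_ids = torch.tensor([[2, 5, 2]])            # vocabulary id 2 occurs twice
token_weights = torch.tensor([[0.7, 1.3, 0.4]])  # per-token weights from the linear head

emb = torch.zeros(B, L, V)
emb = torch.scatter(emb, dim=-1, index=input_ids.unsqueeze(-1), src=token_weights.unsqueeze(-1))  # [B,L,V]
emb = torch.max(torch.relu(emb), dim=-2).values  # [B,V]

print(emb)  # id 2 -> 0.7 (max of 0.7 and 0.4), id 5 -> 1.3, all other ids -> 0.0
```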
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86173a3f7b0551db07283cea1a4bdf092dbeabeb6cace5c022883289265ae549
size 494908542
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenization_sparse.py
ADDED
@@ -0,0 +1,112 @@
from transformers.models.roformer.tokenization_roformer import (WordpieceTokenizer, whitespace_tokenize,
                                                                RoFormerTokenizer)


# Copied from transformers.models.roformer.tokenization_roformer.BasicTokenizer._is_chinese_char
def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and are handled
    # like all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)
        or (cp >= 0x20000 and cp <= 0x2A6DF)
        or (cp >= 0x2A700 and cp <= 0x2B73F)
        or (cp >= 0x2B740 and cp <= 0x2B81F)
        or (cp >= 0x2B820 and cp <= 0x2CEAF)
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)
    ):
        return True

    return False


# Modified from transformers.models.roformer.tokenization_roformer.WordpieceTokenizer
class ChineseWordpieceTokenizer(WordpieceTokenizer):
    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0 and not _is_chinese_char(ord(substr[0])):  # only add ## when not a Chinese character
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


class ChineseRoFormerTokenizer(RoFormerTokenizer):
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=False,
        strip_accents=None,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )
        self.wordpiece_tokenizer = ChineseWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
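The only change relative to the stock RoFormer wordpiece tokenizer is in the greedy longest-match loop: a continuation piece gets the `##` prefix only when it does not start with a Chinese character, so multi-character Chinese words in `vocab.txt` can match directly as word-level tokens. A quick sanity check, assuming the tokenizer is loaded as in the README; the exact split depends on the contents of `vocab.txt`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("infly/inf-wse-v1-base-zh",
                                           trust_remote_code=True, use_fast=False)
print(tokenizer.tokenize('电脑一体机由什么构成?'))
# Expect word-level pieces such as ['电脑', '一体机', '由', '什么', '构成', '?'] rather than
# per-character '##'-prefixed pieces; the actual output depends on vocab.txt.
```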
tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_sparse.ChineseRoFormerTokenizer",
      ""
    ]
  },
  "tokenizer_class": "ChineseRoFormerTokenizer",
  "tokenize_chinese_chars": false
}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff