SamuelYang committed on
Commit e9c77ac
1 Parent(s): 7e5390b

Upload 8 files

README.md ADDED
@@ -0,0 +1,94 @@
---
language:
- zh
base_model: junnyu/roformer_chinese_base
tags:
- transformers
---

## INF Word-level Sparse Embedding (INF-WSE)

**INF-WSE** is a series of word-level sparse embedding models developed by [INFLY TECH](https://www.infly.cn/en).
These models are optimized to generate sparse, high-dimensional text embeddings that excel at capturing the most
relevant information for search and retrieval, particularly in Chinese text.

### Key Features:

- **Optimized for Retrieval**: INF-WSE is designed with retrieval tasks in mind. The sparse embeddings enable efficient
  matching between queries and documents, making it highly effective for semantic search, ranking, and information
  retrieval scenarios where speed and accuracy are critical.
- **Word-level Sparse Embeddings**: The model generates sparse representations at the word level, capturing essential
  semantic details that help improve the relevance of search results. This is particularly useful for Chinese-language
  retrieval tasks, where word segmentation can significantly impact performance.
- **Sparse Representation for Efficiency**: Unlike dense embeddings, which have a fixed number of dimensions, INF-WSE
  produces sparse embeddings whose dimensionality matches the vocabulary size. Most dimensions are zero, with weight
  concentrated on the most significant terms. This sparsity reduces the computational load, enabling faster retrieval
  without compromising precision (see the toy sketch below).
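
As a toy illustration of that layout (the token ids and weights below are invented, not produced by the model), each text maps to a vocabulary-sized vector whose only non-zero entries sit at the ids of its own tokens, so query-document relevance reduces to a sparse dot product:

```python
import torch

vocab_size = 50000  # size of the model's vocabulary

# Hypothetical query vector: non-zero weights only at the ids of its tokens
query_emb = torch.zeros(vocab_size)
query_emb[torch.tensor([1101, 2205, 3077])] = torch.tensor([2.0, 3.3, 1.0])

# Hypothetical document vector that shares two of those tokens
doc_emb = torch.zeros(vocab_size)
doc_emb[torch.tensor([1101, 2205, 4099])] = torch.tensor([1.5, 2.8, 0.7])

# Only the overlapping non-zero dimensions contribute to the score
print(torch.dot(query_emb, doc_emb).item())  # 2.0*1.5 + 3.3*2.8 = 12.24
```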

## Usage

### Transformers

#### Infer Embeddings
```python
import torch
from transformers import AutoTokenizer, AutoModel

queries = ['电脑一体机由什么构成?', '什么是掌上电脑?']
documents = [
    '电脑一体机,是由一台显示器、一个电脑键盘和一个鼠标组成的电脑。',
    '掌上电脑是一种运行在嵌入式操作系统和内嵌式应用软件之上的、小巧、轻便、易带、实用、价廉的手持式计算设备。',
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True, use_fast=False)  # the fast tokenizer is not supported yet
model = AutoModel.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True)
model.eval()

max_length = 512

input_batch = tokenizer(input_texts, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
with torch.no_grad():
    embeddings = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=False)  # return_sparse=True returns a torch sparse tensor; False returns a dense tensor

scores = embeddings[:2] @ embeddings[2:].T
print(scores.tolist())
# [[21.224790573120117, 4.520412921905518], [10.290857315063477, 19.359437942504883]]
```
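
For larger batches, the dense `[num_texts, vocab_size]` output can be memory-heavy. Setting `return_sparse=True` makes the model return a torch sparse (COO) tensor instead. A minimal sketch that continues from the snippet above and scores the sparse output with `torch.sparse.mm` (the values should match the dense scores):

```python
with torch.no_grad():
    sparse_embeddings = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=True)

print(sparse_embeddings.is_sparse)  # True

# torch.sparse.mm multiplies a sparse matrix by a dense one without densifying it first
all_scores = torch.sparse.mm(sparse_embeddings, embeddings.T)  # [4, 4]
print(all_scores[:2, 2:].tolist())  # queries vs. documents, same values as above
```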

#### Convert embeddings to lexical weights
```python
import torch
from collections import OrderedDict


def convert_embeddings_to_weights(embeddings, tokenizer):
    values, indices = torch.sort(embeddings, dim=-1, descending=True)

    token2weight = []
    for i in range(embeddings.size(0)):
        token2weight.append(OrderedDict())

        non_zero_mask = values[i] != 0
        tokens = tokenizer.convert_ids_to_tokens(indices[i][non_zero_mask])
        weights = values[i][non_zero_mask].tolist()

        for token, weight in zip(tokens, weights):
            token2weight[i][token] = weight

    return token2weight


token2weight = convert_embeddings_to_weights(embeddings, tokenizer)
print(token2weight[0])
# OrderedDict([('一体机', 3.3438382148742676), ('由', 2.493837356567383), ('电脑', 2.0291812419891357), ('构成', 1.986171841621399), ('什么', 1.0218793153762817)])
```
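
Because every non-zero dimension corresponds to a vocabulary token, these lexical weights drop straight into inverted-index-style scoring: a document contributes to a query's score only through the tokens they share. A minimal sketch using the `token2weight` computed above for all four input texts (queries at indices 0 and 1, documents at 2 and 3):

```python
def lexical_score(query_weights, doc_weights):
    # Sum the weight products over tokens that occur in both texts
    return sum(weight * doc_weights[token]
               for token, weight in query_weights.items()
               if token in doc_weights)

print([lexical_score(token2weight[0], token2weight[d]) for d in (2, 3)])
# Up to floating-point error, this reproduces the first row of the dense scores above
```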

## Evaluation

### C-MTEB Retrieval task
([Chinese Massive Text Embedding Benchmark](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB))

Metric: nDCG@10

| Model Name | Max Length | Average | Cmedqa | Covid | Du | Ecom | Medical | MMarco | T2 | Video |
|:---------------------------------------------------:|:----------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
| [BM25-zh](https://github.com/castorini/pyserini) | - | 25.39 | 13.70 | **86.66** | 13.68 | 11.49 | 15.48 | 6.56 | 29.53 | 25.98 |
| [bge-m3-sparse](https://huggingface.co/BAAI/bge-m3) | 512 | 29.94 | **24.50** | 76.16 | 22.12 | 17.62 | 27.52 | 9.78 | **37.69** | 24.12 |
| **inf-wse-v1-base-zh** | 512 | **32.83** | 20.51 | 76.40 | **36.77** | **19.97** | **28.61** | **13.32** | 36.81 | **30.25** |
config.json ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "RoFormerModel"
  ],
  "auto_map": {
    "AutoModel": "modeling_sparse.RoFormerForSparseEmbedding"
  },
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1536,
  "model_type": "roformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "rotary_value": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50000
}
modeling_sparse.py ADDED
@@ -0,0 +1,35 @@
import torch
import torch.nn as nn
from transformers import RoFormerModel, RoFormerPreTrainedModel


class RoFormerForSparseEmbedding(RoFormerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.encoder = RoFormerModel(config)
        self.linear_layer = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask, return_sparse=False):
        B, L = input_ids.shape

        # Predict a scalar importance weight for every token position
        last_hidden_states = self.encoder(input_ids, attention_mask)['last_hidden_state']  # [B,L,D]
        token_weights = self.linear_layer(last_hidden_states).squeeze(-1)  # [B,L]

        # Mask out padding positions, the leading [CLS] token and the trailing [SEP] token
        token_mask = (1 - attention_mask) * -1e4  # [B,L]
        token_mask[:, 0] = -1e4
        last_ind = torch.sum(attention_mask, -1, keepdim=True) - 1  # [B,1]
        token_mask = torch.scatter(token_mask, -1, last_ind, -1e4)  # [B,L]
        token_weights = token_weights + token_mask  # [B,L]

        # Scatter each token's weight into a vocabulary-sized vector, then max-pool over the sequence
        emb = torch.zeros(B, L, self.encoder.config.vocab_size, dtype=token_weights.dtype,
                          device=token_weights.device)  # [B,L,V]
        emb = torch.scatter(emb, dim=-1, index=input_ids.unsqueeze(-1), src=token_weights.unsqueeze(-1))  # [B,L,V]
        emb = torch.max(torch.relu(emb), dim=-2).values  # [B,V]

        if return_sparse:
            emb = emb.to_sparse()

        return emb
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86173a3f7b0551db07283cea1a4bdf092dbeabeb6cace5c022883289265ae549
size 494908542
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenization_sparse.py ADDED
@@ -0,0 +1,112 @@
from transformers.models.roformer.tokenization_roformer import (WordpieceTokenizer, whitespace_tokenize,
                                                                 RoFormerTokenizer)


# Copied from transformers.models.roformer.tokenization_roformer.BasicTokenizer._is_chinese_char
def _is_chinese_char(cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like all of the other languages.
    if (
        (cp >= 0x4E00 and cp <= 0x9FFF)
        or (cp >= 0x3400 and cp <= 0x4DBF)  #
        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
        or (cp >= 0xF900 and cp <= 0xFAFF)
        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
    ):  #
        return True

    return False


# Modified from transformers.models.roformer.tokenization_roformer.WordpieceTokenizer
class ChineseWordpieceTokenizer(WordpieceTokenizer):
    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0 and not _is_chinese_char(ord(substr[0])):  # only add ## when not a Chinese character
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


class ChineseRoFormerTokenizer(RoFormerTokenizer):
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=False,
        strip_accents=None,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )
        self.wordpiece_tokenizer = ChineseWordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
{
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_sparse.ChineseRoFormerTokenizer",
      ""
    ]
  },
  "tokenizer_class": "ChineseRoFormerTokenizer",
  "tokenize_chinese_chars": false
}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff