kevinkrahn committed on
Commit 891db93
1 Parent(s): 89a4cd4

Add new SentenceTransformer model.
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": true,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
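This pooling configuration enables only `pooling_mode_cls_token`, so the sentence embedding is simply the first ([CLS]) token of the transformer output. A minimal sketch of that operation (the function name and toy shapes are illustrative, not the library's internals):

```python
import torch

def cls_pooling(token_embeddings: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, word_embedding_dimension) from the transformer module.
    # With pooling_mode_cls_token=true, the sentence embedding is the hidden state
    # of the first token of each sequence.
    return token_embeddings[:, 0]

# Toy check: batch of 2 sequences, 5 tokens, dimension 768.
print(cls_pooling(torch.randn(2, 5, 768)).shape)  # torch.Size([2, 768])
```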
README.md ADDED
@@ -0,0 +1,125 @@
+ ---
+ library_name: sentence-transformers
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+ - semantic-search
+
+ ---
+
+ # sge-hlm
+
+ ## Sentence embeddings for English and Ancient Greek
+
+ The HLM model architecture is based on [Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers](https://aclanthology.org/2024.sigtyp-1.16/), but uses a simpler design with rotary embeddings (see the implementation in the `HLM` folder) instead of building on DeBERTa. This architecture produces superior results compared to vanilla BERT for low-resource languages like Ancient Greek. It is trained to produce sentence embeddings using the method described in [Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge Distillation](https://aclanthology.org/2023.alp-1.2/).
+
+ This model was distilled from `BAAI/bge-base-en-v1.5` for embedding English and Ancient Greek text.
+
+ ## Usage (Sentence-Transformers)
+
+ First install [sentence-transformers](https://www.SBERT.net):
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+
+
+ ## Usage (HuggingFace Transformers)
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ def cls_pooling(model_output, attention_mask):
+     return model_output[0][:,0]
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+ model = AutoModel.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling. In this case, CLS pooling.
+ sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+ ## Citing & Authors
+
+ ```
+ @inproceedings{riemenschneider-krahn-2024-heidelberg,
+     title = "Heidelberg-Boston @ {SIGTYP} 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers",
+     author = "Riemenschneider, Frederick and
+       Krahn, Kevin",
+     editor = "Hahn, Michael and
+       Sorokin, Alexey and
+       Kumar, Ritesh and
+       Shcherbakov, Andreas and
+       Otmakhova, Yulia and
+       Yang, Jinrui and
+       Serikov, Oleg and
+       Rani, Priya and
+       Ponti, Edoardo M. and
+       Murado{\u{g}}lu, Saliha and
+       Gao, Rena and
+       Cotterell, Ryan and
+       Vylomova, Ekaterina",
+     booktitle = "Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
+     month = mar,
+     year = "2024",
+     address = "St. Julian's, Malta",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2024.sigtyp-1.16",
+     pages = "131--141",
+ }
+ ```
+
+ ```
+ @inproceedings{krahn-etal-2023-sentence,
+     title = "Sentence Embedding Models for {A}ncient {G}reek Using Multilingual Knowledge Distillation",
+     author = "Krahn, Kevin and
+       Tate, Derrick and
+       Lamicela, Andrew C.",
+     editor = "Anderson, Adam and
+       Gordin, Shai and
+       Li, Bin and
+       Liu, Yudong and
+       Passarotti, Marco C.",
+     booktitle = "Proceedings of the Ancient Language Processing Workshop",
+     month = sep,
+     year = "2023",
+     address = "Varna, Bulgaria",
+     publisher = "INCOMA Ltd., Shoumen, Bulgaria",
+     url = "https://aclanthology.org/2023.alp-1.2",
+     pages = "13--22",
+ }
+ ```
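The README stops at printing raw embeddings. For the semantic-search use case listed in the tags, a short follow-up sketch using standard sentence-transformers utilities (the example sentences are illustrative, and `trust_remote_code=True` is assumed to be required for the custom architecture):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('kevinkrahn/shlm-grc-en', trust_remote_code=True)

query = "In the beginning was the Word"
corpus = [
    "Ἐν ἀρχῇ ἦν ὁ λόγος",            # Ancient Greek
    "This is an unrelated sentence",
]

query_emb = model.encode(query, convert_to_tensor=True)
corpus_emb = model.encode(corpus, convert_to_tensor=True)

# Cosine similarity between the query and each corpus sentence.
print(util.cos_sim(query_emb, corpus_emb))
```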
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "models/output/shlm-grc-en",
+   "architectures": [
+     "HLMModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_hlm.HLMConfig",
+     "AutoModel": "modeling_hlm.HLMModel"
+   },
+   "embedding_size": -1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "inter_word_encoder": {
+     "intermediate_size": 2048,
+     "model_type": "",
+     "sandwich_size": 2
+   },
+   "intra_word_encoder": {
+     "intermediate_size": 1536,
+     "model_type": "",
+     "num_hidden_layers": 4
+   },
+   "max_seq_length": 256,
+   "max_word_length": 16,
+   "model_type": "hlm",
+   "pad_token_id": 0,
+   "residual_word_embedding": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "type_vocab_size": 2,
+   "vocab_size": 512
+ }
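The `auto_map` block above is what lets the `Auto*` classes resolve the custom `HLMConfig` and `HLMModel` defined in this repository. A hedged sketch of loading through that mechanism (the values in the comments are taken from the config above):

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code=True opts in to running configuration_hlm.py / modeling_hlm.py
# from this repository, as pointed to by "auto_map".
config = AutoConfig.from_pretrained("kevinkrahn/shlm-grc-en", trust_remote_code=True)
print(config.model_type)                            # "hlm"
print(config.intra_word_encoder.num_hidden_layers)  # 4 intra-word layers
print(config.inter_word_encoder.sandwich_size)      # 2 weight-shared "sandwich" layers

model = AutoModel.from_pretrained("kevinkrahn/shlm-grc-en", trust_remote_code=True)
```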
config_sentence_transformers.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.4.0.dev0",
+     "transformers": "4.39.3",
+     "pytorch": "2.3.0+cu121"
+   },
+   "prompts": {},
+   "default_prompt_name": null
+ }
configuration_hlm.py ADDED
@@ -0,0 +1,59 @@
+ from transformers import PretrainedConfig
+
+ class HLMEncoderConfig(PretrainedConfig):
+     def __init__(
+         self,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         hidden_dropout_prob=0.1,
+         layer_norm_eps=1e-7,
+         sandwich=False,
+         sandwich_size=0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.dropout_prob = hidden_dropout_prob
+         self.layer_norm_eps = layer_norm_eps
+         if sandwich:
+             self.sandwich_size = num_hidden_layers // 6
+         else:
+             self.sandwich_size = sandwich_size
+
+
+ class HLMConfig(PretrainedConfig):
+     model_type = "hlm"
+
+     def __init__(
+         self,
+         vocab_size=512,
+         type_vocab_size=2,
+         embedding_size=-1,
+         max_seq_length=256,
+         max_word_length=16,
+         initializer_range=0.02,
+         pad_token_id=0,
+         intra_word_encoder={},
+         inter_word_encoder={},
+         residual_word_embedding=False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.vocab_size = vocab_size
+         self.type_vocab_size = type_vocab_size
+         self.embedding_size = embedding_size
+         self.initializer_range = initializer_range
+         self.max_seq_length = max_seq_length
+         self.max_word_length = max_word_length
+         self.pad_token_id = pad_token_id
+         self.intra_word_encoder = HLMEncoderConfig(**intra_word_encoder)
+         self.inter_word_encoder = HLMEncoderConfig(**inter_word_encoder)
+         self.hidden_size = self.inter_word_encoder.hidden_size
+         self.residual_word_embedding = residual_word_embedding
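To show how the nested encoder configs compose, a small sketch constructing `HLMConfig` directly with shapes matching config.json (assumes `configuration_hlm.py` is importable from the working directory):

```python
from configuration_hlm import HLMConfig

config = HLMConfig(
    vocab_size=512,       # character-level vocabulary
    max_word_length=16,   # characters per word (padded to this length)
    max_seq_length=256,   # words per sequence
    intra_word_encoder={"intermediate_size": 1536, "num_hidden_layers": 4},
    inter_word_encoder={"intermediate_size": 2048, "sandwich_size": 2},
)

# The dicts are wrapped in HLMEncoderConfig, and the top-level hidden size
# follows the inter-word encoder (768 by default).
print(config.intra_word_encoder.hidden_size, config.hidden_size)  # 768 768
```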
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a6e4c5f4eb9a71f57b56dac6a207932e0def2a9fb3c9956ae28482b39cfe6f
+ size 379310632
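The file above is a Git LFS pointer, not the weights themselves; the roughly 379 MB safetensors file is fetched separately. A small sketch for verifying a downloaded copy against the pointer's SHA-256 (the local path is an assumption):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading ~379 MB into memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Path assumes the repository was cloned with git-lfs so the real weights are present.
print(sha256_of("model.safetensors") ==
      "d0a6e4c5f4eb9a71f57b56dac6a207932e0def2a9fb3c9956ae28482b39cfe6f")
```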
modeling_hlm.py ADDED
@@ -0,0 +1,614 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from dataclasses import dataclass
6
+ import copy
7
+
8
+ from transformers.modeling_outputs import BaseModelOutput, ModelOutput, MaskedLMOutput, TokenClassifierOutput, SequenceClassifierOutput
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, AutoModelForMaskedLM, AutoTokenizer, AutoModelForSequenceClassification
11
+ from .configuration_hlm import HLMConfig, HLMEncoderConfig
12
+ from .tokenization_hlm import HLMTokenizer
13
+
14
+ from typing import Tuple, Optional, Union
15
+
16
+ @dataclass
17
+ class HLMBaseModelOutput(ModelOutput):
18
+ last_hidden_state: torch.FloatTensor = None
19
+ hidden_states: Tuple[torch.FloatTensor] = None
20
+ attentions: Tuple[torch.FloatTensor] = None # Not currently supported
21
+
22
+ initial_embeds: torch.FloatTensor = None
23
+ initial_word_embeds: torch.FloatTensor = None
24
+ intra_word_mask: torch.LongTensor = None
25
+ char_embeds: torch.LongTensor = None
26
+ input_shape: Tuple[int, int, int, int] = None
27
+
28
+
29
+ class HLMEncoder(nn.Module):
30
+ def __init__(self, config) -> None:
31
+ super().__init__()
32
+
33
+ if config.sandwich_size > 0:
34
+ sandwich_start_index = config.num_hidden_layers // 2 - config.sandwich_size
35
+ sandwich_indices = [sandwich_start_index + i*2 + 1 for i in range(config.sandwich_size)]
36
+ #print('Sandwich indices:', sandwich_indices)
37
+ self.layers = nn.ModuleList([
38
+ TransformerBlock(config, bias=i in sandwich_indices) for i in range(config.num_hidden_layers)])
39
+ for i in range(config.sandwich_size):
40
+ self.layers[sandwich_start_index + i*2+1].make_sandwich(self.layers[sandwich_start_index + i*2])
41
+ else:
42
+ self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])
43
+
44
+ def _get_attention_mask(self, attn_mask, dtype):
45
+ if attn_mask.dim() <= 2:
46
+ extended_mask = attn_mask.unsqueeze(1).unsqueeze(2)
47
+ extended_mask = extended_mask*extended_mask.squeeze(-2).unsqueeze(-1)
48
+ elif attn_mask.dim() == 3:
49
+ extended_mask = attn_mask.unsqueeze(1)
50
+ else:
51
+ extended_mask = attn_mask
52
+
53
+ # Convert to float to avoid zero in denominator of softmax in SDPA, resulting in NaNs
54
+ min_dtype = torch.finfo(dtype).min
55
+ extended_mask = ((1.0 - extended_mask.float()) * min_dtype)
56
+
57
+ # SDPA returns NaNs for fully masked rows, so attend to all tokens instead
58
+ extended_mask = extended_mask.mul(~torch.all(extended_mask==min_dtype, dim=-1, keepdim=True))
59
+
60
+ return extended_mask
61
+
62
+ def forward(self, hidden_states, attention_mask, freqs_cos, freqs_sin, return_dict=True, output_hidden_states=False):
63
+ all_hidden_states = []
64
+ attn_mask = self._get_attention_mask(attention_mask, hidden_states.dtype)
65
+ for layer in self.layers:
66
+ hidden_states = layer(hidden_states, attn_mask, freqs_cos, freqs_sin)
67
+ all_hidden_states.append(hidden_states)
68
+
69
+ if return_dict:
70
+ return BaseModelOutput(
71
+ last_hidden_state=all_hidden_states[-1],
72
+ hidden_states=all_hidden_states if output_hidden_states else None,
73
+ attentions=None,
74
+ )
75
+ else:
76
+ return (all_hidden_states[-1], all_hidden_states) if output_hidden_states else all_hidden_states
77
+
78
+
79
+ class HLMPreTrainedModel(PreTrainedModel):
80
+ """
81
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
82
+ models.
83
+ """
84
+
85
+ config_class = HLMConfig
86
+ base_model_prefix = "hlm"
87
+ _keys_to_ignore_on_load_unexpected = []
88
+ supports_gradient_checkpointing = True
89
+
90
+ def _init_weights(self, module):
91
+ """Initialize the weights."""
92
+ if isinstance(module, nn.Linear):
93
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
94
+ if module.bias is not None:
95
+ module.bias.data.zero_()
96
+ elif isinstance(module, nn.Embedding):
97
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
98
+ if module.padding_idx is not None:
99
+ module.weight.data[module.padding_idx].zero_()
100
+
101
+
102
+ class HLMModel(HLMPreTrainedModel):
103
+ def __init__(self, config):
104
+ super().__init__(config)
105
+
106
+ self.config = config
107
+
108
+ self.char_embeddings = nn.Embedding(config.vocab_size, config.intra_word_encoder.hidden_size, padding_idx=0)
109
+ self.char_embedding_dropout = nn.Dropout(config.intra_word_encoder.dropout_prob)
110
+
111
+ if self.config.embedding_size != -1 and self.config.embedding_size != self.config.intra_word_encoder.hidden_size:
112
+ self.char_embedding_project = nn.Linear(self.config.embedding_size, self.config.intra_word_encoder.hidden_size, bias=False)
113
+
114
+ freqs_cos, freqs_sin = precompute_freqs_cis(config.intra_word_encoder.hidden_size // config.intra_word_encoder.num_attention_heads, config.max_seq_length)
115
+ self.register_buffer("freqs_cos", freqs_cos)
116
+ self.register_buffer("freqs_sin", freqs_sin)
117
+
118
+ self.word_type_embeddings = nn.Embedding(config.type_vocab_size, config.intra_word_encoder.hidden_size)
119
+
120
+ self.intra_word_encoder = HLMEncoder(config.intra_word_encoder)
121
+ if self.config.intra_word_encoder.hidden_size != self.config.inter_word_encoder.hidden_size:
122
+ self.intra_word_project = nn.Linear(self.config.intra_word_encoder.hidden_size, self.config.inter_word_encoder.hidden_size, bias=False)
123
+
124
+ self.inter_word_encoder = HLMEncoder(config.inter_word_encoder)
125
+
126
+ # Initialize weights and apply final processing
127
+ self.post_init()
128
+
129
+ def get_input_embeddings(self):
130
+ return self.char_embeddings
131
+
132
+ def set_input_embeddings(self, new_embeddings):
133
+ self.char_embeddings = new_embeddings
134
+
135
+ def forward(self, input_ids, char_input_mask, word_input_mask, word_type_ids=None, combined_word_embeddings: Optional[bool]=False, output_hidden_states: Optional[bool]=False, return_dict: Optional[bool]=True):
136
+ input_embeds = self.char_embeddings(input_ids)
137
+ input_embeds = self.char_embedding_dropout(input_embeds)
138
+
139
+ if hasattr(self, "char_embedding_project"):
140
+ input_embeds = self.char_embedding_project(input_embeds)
141
+
142
+ batch_size, num_word, _, _ = input_embeds.shape
143
+ num_char = self.config.max_word_length
144
+
145
+ # reshape to attend to intra-word tokens rather than full sequence
146
+ input_embeds = input_embeds.view(batch_size * num_word, num_char, self.config.intra_word_encoder.hidden_size)
147
+ intra_word_mask = char_input_mask.view(batch_size * num_word, num_char)
148
+ intra_word_output = self.intra_word_encoder(
149
+ input_embeds,
150
+ intra_word_mask,
151
+ self.freqs_cos[:num_char],
152
+ self.freqs_sin[:num_char],
153
+ output_hidden_states=False,
154
+ return_dict=True,
155
+ )
156
+ initial_embeds = intra_word_output.last_hidden_state
157
+
158
+ # extract [WORD_CLS] embeddings, which are always at the beginning of each word
159
+ initial_word_embeds = initial_embeds[:,0,:]
160
+
161
+ if word_type_ids is not None:
162
+ word_type_embeds = self.word_type_embeddings(word_type_ids)
163
+ word_type_embeds = word_type_embeds.view(batch_size * num_word, self.config.intra_word_encoder.hidden_size)
164
+ initial_word_embeds = initial_word_embeds + word_type_embeds
165
+
166
+ if hasattr(self, "intra_word_project"):
167
+ initial_embeds = self.intra_word_project(initial_embeds)
168
+
169
+ # reshape and extract contextualized inter-word representation
170
+ word_embeds = initial_word_embeds.view(batch_size, num_word, self.config.inter_word_encoder.hidden_size)
171
+ inter_word_output = self.inter_word_encoder(
172
+ word_embeds,
173
+ word_input_mask,
174
+ self.freqs_cos[:num_word],
175
+ self.freqs_sin[:num_word],
176
+ output_hidden_states=output_hidden_states,
177
+ return_dict=True,
178
+ )
179
+
180
+ if combined_word_embeddings:
181
+ initial_word_embeds = initial_word_embeds.view(batch_size, num_word, self.config.inter_word_encoder.hidden_size)
182
+ contextual_word_embeds = inter_word_output.last_hidden_state
183
+ combined_word_embeds = torch.cat([initial_word_embeds, contextual_word_embeds], dim=2)
184
+ last_hidden_state = combined_word_embeds
185
+ else:
186
+ last_hidden_state = inter_word_output.last_hidden_state
187
+
188
+ if return_dict:
189
+ return HLMBaseModelOutput(
190
+ last_hidden_state=last_hidden_state,
191
+ hidden_states=inter_word_output.hidden_states if output_hidden_states else None,
192
+ initial_embeds=initial_embeds,
193
+ initial_word_embeds=initial_word_embeds,
194
+ intra_word_mask=intra_word_mask,
195
+ char_embeds=input_embeds,
196
+ input_shape=(batch_size, num_word, num_char, self.config.inter_word_encoder.hidden_size),
197
+ )
198
+ else:
199
+ return (
200
+ last_hidden_state,
201
+ inter_word_output.hidden_states if output_hidden_states else None,
202
+ initial_embeds,
203
+ initial_word_embeds,
204
+ intra_word_mask,
205
+ input_embeds,
206
+ (batch_size, num_word, num_char, self.config.inter_word_encoder.hidden_size),
207
+ )
208
+
209
+
210
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
211
+ ndim = x.ndim
212
+ assert 0 <= 1 < ndim
213
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
214
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
215
+ return freqs_cis.view(*shape)
216
+
217
+
218
+ def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
219
+ # reshape xq and xk to match the complex representation
220
+ xq_r, xq_i = xq.float().reshape(*xq.shape[:-1], -1, 2).unbind(-1)
221
+ xk_r, xk_i = xk.float().reshape(*xk.shape[:-1], -1, 2).unbind(-1)
222
+
223
+ # reshape freqs_cos and freqs_sin for broadcasting
224
+ freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
225
+ freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
226
+
227
+ # apply rotation using real numbers
228
+ xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
229
+ xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
230
+ xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
231
+ xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos
232
+
233
+ # flatten last two dimensions
234
+ xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
235
+ xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
236
+
237
+ return xq_out.type_as(xq), xk_out.type_as(xk)
238
+
239
+
240
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
241
+ freqs = 1.0 / (
242
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
243
+ )
244
+ t = torch.arange(end, device=freqs.device) # type: ignore
245
+ freqs = torch.outer(t, freqs).float() # type: ignore
246
+ freqs_cos = torch.cos(freqs) # real part
247
+ freqs_sin = torch.sin(freqs) # imaginary part
248
+ return freqs_cos, freqs_sin
249
+
250
+
251
+ class RMSNorm(torch.nn.Module):
252
+ def __init__(self, dim: int, eps: float = 1e-6):
253
+ super().__init__()
254
+ self.eps = eps
255
+ self.weight = nn.Parameter(torch.ones(dim))
256
+
257
+ def _norm(self, x):
258
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
259
+
260
+ def forward(self, x):
261
+ output = self._norm(x.float()).type_as(x)
262
+ return output * self.weight
263
+
264
+
265
+ class TransformerBlock(nn.Module):
266
+ def __init__(self, config: HLMEncoderConfig, bias: bool = False):
267
+ super().__init__()
268
+
269
+ self.pad_id = config.pad_token_id
270
+ self.drop_p = config.dropout_prob
271
+ self.n_heads = config.num_attention_heads
272
+ self.d_head = config.hidden_size // config.num_attention_heads
273
+ self.has_bias = bias
274
+ dim = config.hidden_size
275
+
276
+ # Attention
277
+ self.q = nn.Linear(in_features=dim, out_features=dim, bias=bias)
278
+ self.k = nn.Linear(in_features=dim, out_features=dim, bias=bias)
279
+ self.v = nn.Linear(in_features=dim, out_features=dim, bias=bias)
280
+ self.att_proj_linear = nn.Linear(in_features=dim, out_features=dim, bias=bias)
281
+ self.resid_dropout = nn.Dropout(self.drop_p)
282
+
283
+ # Feedforward layer
284
+ self.ff_dropout = nn.Dropout(self.drop_p)
285
+ self.ff_linear_1 = nn.Linear(in_features=dim, out_features=config.intermediate_size, bias=bias)
286
+ self.ff_linear_2 = nn.Linear(in_features=config.intermediate_size, out_features=dim, bias=bias)
287
+ self.ff_linear_3 = nn.Linear(in_features=dim, out_features=config.intermediate_size, bias=bias)
288
+
289
+ # Pre-layer norms
290
+ self.attn_norm = RMSNorm(dim, eps=config.layer_norm_eps)
291
+ self.ff_norm = RMSNorm(dim, eps=config.layer_norm_eps)
292
+
293
+ def make_sandwich(self, other):
294
+ assert self.has_bias
295
+ assert not other.has_bias
296
+ self.q.weight = other.q.weight
297
+ self.k.weight = other.k.weight
298
+ self.v.weight = other.v.weight
299
+ self.att_proj_linear.weight = other.att_proj_linear.weight
300
+ self.ff_linear_1.weight = other.ff_linear_1.weight
301
+ self.ff_linear_2.weight = other.ff_linear_2.weight
302
+ self.ff_linear_3.weight = other.ff_linear_3.weight
303
+
304
+ def forward(self, x: torch.Tensor, pad_mask: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
305
+ x = x + self._attention_block(self.attn_norm(x), pad_mask, freqs_cos, freqs_sin)
306
+ x = x + self._feedforward_block(self.ff_norm(x))
307
+ return x
308
+
309
+ def _attention_block(self, x: torch.Tensor, attn_mask: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
310
+ batch_size, seq_len, _ = x.shape
311
+ xq, xk, xv = self.q(x), self.k(x), self.v(x)
312
+
313
+ # Reshape for rotary embeddings
314
+ xq = xq.view(batch_size, seq_len, self.n_heads, self.d_head)
315
+ xk = xk.view(batch_size, seq_len, self.n_heads, self.d_head)
316
+ xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
317
+ xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
318
+
319
+ # Reshape for attention calculation: (b_sz, n_head, s_len, d_head)
320
+ xq = xq.transpose(1, 2)
321
+ xk = xk.transpose(1, 2)
322
+ xv = xv.transpose(1, 2)
323
+
324
+ att = F.scaled_dot_product_attention(
325
+ query=xq, key=xk, value=xv,
326
+ attn_mask=attn_mask,
327
+ dropout_p=self.drop_p if self.training else 0.0,
328
+ is_causal=False,
329
+ )
330
+
331
+ # Shape (b_sz, s_len, n_head, d_head)
332
+ out = att.transpose(1, 2).contiguous()
333
+ out = out.view(batch_size, seq_len, self.n_heads * self.d_head)
334
+
335
+ return self.resid_dropout(self.att_proj_linear(out))
336
+
337
+ def _feedforward_block(self, x: torch.Tensor):
338
+ # SWiGLU activation
339
+ x = self.ff_linear_2(F.silu(self.ff_linear_1(x)) * self.ff_linear_3(x))
340
+ x = self.ff_dropout(x)
341
+ return x
342
+
343
+
344
+ class HLMForMaskedLM(HLMPreTrainedModel):
345
+ _tied_weights_keys = ["cls.decoder.weight", "cls.decoder.bias"]
346
+
347
+ def __init__(self, config):
348
+ super().__init__(config)
349
+
350
+ # NOTE: This property name must match "base_model_prefix" in the base class
351
+ self.hlm = HLMModel(config)
352
+ self.cls = HLMLMPredictionHead(config)
353
+
354
+ # Initialize weights and apply final processing
355
+ self.post_init()
356
+
357
+ def get_output_embeddings(self):
358
+ return self.cls.decoder
359
+
360
+ def set_output_embeddings(self, new_embeddings):
361
+ self.cls.decoder = new_embeddings
362
+
363
+ def forward(
364
+ self,
365
+ input_ids: Optional[torch.Tensor] = None,
366
+ labels: Optional[torch.Tensor] = None,
367
+ char_input_mask: Optional[torch.Tensor] = None,
368
+ word_input_mask: Optional[torch.Tensor] = None,
369
+ word_type_ids: Optional[torch.Tensor] = None,
370
+ output_hidden_states: Optional[bool] = None,
371
+ return_dict: Optional[bool] = True,
372
+ ) -> Union[Tuple, MaskedLMOutput]:
373
+ r"""
374
+ labels (`torch.LongTensor` of shape `(batch_size, num_words, max_chars_per_word)`, *optional*):
375
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
376
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
377
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
378
+ """
379
+
380
+ outputs = self.hlm(
381
+ input_ids,
382
+ char_input_mask=char_input_mask,
383
+ word_input_mask=word_input_mask,
384
+ word_type_ids=word_type_ids,
385
+ output_hidden_states=output_hidden_states,
386
+ return_dict=return_dict,
387
+ combined_word_embeddings=False,
388
+ )
389
+
390
+ prediction_scores = self.cls(outputs,
391
+ freqs_cos=self.hlm.freqs_cos[:self.config.max_word_length],
392
+ freqs_sin=self.hlm.freqs_sin[:self.config.max_word_length])
393
+
394
+ masked_lm_loss = None
395
+ if labels is not None:
396
+ loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
397
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
398
+
399
+ if not return_dict:
400
+ output = (prediction_scores,) + outputs[1:]
401
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
402
+ else:
403
+ return MaskedLMOutput(
404
+ loss=masked_lm_loss,
405
+ logits=prediction_scores,
406
+ hidden_states=outputs.hidden_states,
407
+ )
408
+
409
+
410
+ class HLMLMPredictionHead(nn.Module):
411
+ def __init__(self, config):
412
+ super().__init__()
413
+
414
+ intra_word_encoder_config = copy.copy(config.intra_word_encoder)
415
+ intra_word_encoder_config.num_hidden_layers = 1
416
+ intra_word_encoder_config.sandwich_size = 0
417
+ self.intra_word_encoder = HLMEncoder(intra_word_encoder_config)
418
+ self.residual_word_embedding = getattr(config, 'residual_word_embedding', False)
419
+ self.config = config
420
+
421
+ if self.config.intra_word_encoder.hidden_size != self.config.inter_word_encoder.hidden_size:
422
+ self.inter_word_project = nn.Linear(config.inter_word_encoder.hidden_size, self.config.intra_word_encoder.hidden_size, bias=False)
423
+
424
+ if getattr(config, "tie_word_embeddings", True):
425
+ # The output weights are the same as the input embeddings, but there is
426
+ # an output-only bias for each token.
427
+ self.decoder = nn.Linear(config.intra_word_encoder.hidden_size, config.vocab_size, bias=False)
428
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
429
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
430
+ self.decoder.bias = self.bias
431
+ else:
432
+ self.decoder = nn.Linear(config.intra_word_encoder.hidden_size, config.vocab_size)
433
+
434
+ def forward(self, base_model_output: HLMBaseModelOutput, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
435
+ batch_size, num_word, _, _ = base_model_output.input_shape
436
+
437
+ word_embeds = base_model_output.last_hidden_state.reshape(batch_size * num_word, 1, self.config.inter_word_encoder.hidden_size)
438
+
439
+ if self.residual_word_embedding:
440
+ # residual connection between initial word embeddings and contextual word embeddings as mentioned in the paper (section A.3)
441
+ word_embeds += base_model_output.initial_word_embeds.unsqueeze(1)
442
+
443
+ if hasattr(self, "inter_word_project"):
444
+ word_embeds = self.inter_word_project(word_embeds)
445
+
446
+ # concatenate to restore the character-level token sequence
447
+ char_embeds = torch.cat([word_embeds, base_model_output.initial_embeds[:,1:,:]], dim=1)
448
+
449
+ intra_word_output = self.intra_word_encoder(
450
+ char_embeds,
451
+ base_model_output.intra_word_mask,
452
+ freqs_cos, freqs_sin,
453
+ output_hidden_states=False,
454
+ return_dict=True,
455
+ )
456
+
457
+ char_logits = self.decoder(intra_word_output.last_hidden_state)
458
+ batch_size, num_word, num_char, _ = base_model_output.input_shape
459
+ char_logits = char_logits.reshape(batch_size, num_word * num_char, -1)
460
+ return char_logits
461
+
462
+
463
+ class HLMForTokenClassification(HLMPreTrainedModel):
464
+ def __init__(self, config):
465
+ super().__init__(config)
466
+ self.num_labels = config.num_labels
467
+
468
+ self.hlm = HLMModel(config)
469
+ self.cls = nn.Linear(config.inter_word_encoder.hidden_size*2, config.num_labels)
470
+
471
+ # Initialize weights and apply final processing
472
+ self.post_init()
473
+
474
+ def forward(
475
+ self,
476
+ input_ids: Optional[torch.Tensor] = None,
477
+ char_input_mask: Optional[torch.Tensor] = None,
478
+ word_input_mask: Optional[torch.Tensor] = None,
479
+ labels: Optional[torch.Tensor] = None,
480
+ output_hidden_states: Optional[bool] = None,
481
+ return_dict: Optional[bool] = None,
482
+ ) -> Union[Tuple, TokenClassifierOutput]:
483
+ r"""
484
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
485
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
486
+ """
487
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
488
+
489
+ outputs = self.hlm(
490
+ input_ids,
491
+ char_input_mask=char_input_mask,
492
+ word_input_mask=word_input_mask,
493
+ output_hidden_states=output_hidden_states,
494
+ combined_word_embeddings=True,
495
+ )
496
+
497
+ logits = self.cls(outputs.last_hidden_state)
498
+
499
+ loss = None
500
+ if labels is not None:
501
+ loss_fct = nn.CrossEntropyLoss()
502
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
503
+
504
+ if not return_dict:
505
+ output = (logits,) + outputs[1:]
506
+ return ((loss,) + output) if loss is not None else output
507
+
508
+ return TokenClassifierOutput(
509
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
510
+ )
511
+
512
+
513
+ class HLMForSequenceClassification(HLMPreTrainedModel):
514
+ def __init__(self, config):
515
+ super().__init__(config)
516
+
517
+ self.config = config
518
+ self.num_labels = getattr(config, 'num_labels', 2)
519
+ self.hlm = HLMModel(config)
520
+
521
+ self.dense = nn.Linear(config.inter_word_encoder.hidden_size, config.inter_word_encoder.hidden_size)
522
+ self.dropout = nn.Dropout(0.1)
523
+ self.classifier = nn.Linear(config.inter_word_encoder.hidden_size, config.num_labels)
524
+ #self.activation = SwiGLU()
525
+ self.activation = nn.GELU()
526
+
527
+ # Initialize weights and apply final processing
528
+ self.post_init()
529
+
530
+ def forward(
531
+ self,
532
+ input_ids: Optional[torch.Tensor] = None,
533
+ char_input_mask: Optional[torch.Tensor] = None,
534
+ word_input_mask: Optional[torch.Tensor] = None,
535
+ word_type_ids: Optional[torch.Tensor] = None,
536
+ labels: Optional[torch.Tensor] = None,
537
+ output_hidden_states: Optional[bool] = None,
538
+ return_dict: Optional[bool] = None,
539
+ ) -> Union[Tuple, SequenceClassifierOutput]:
540
+ r"""
541
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
542
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
543
+ """
544
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
545
+
546
+ outputs = self.hlm(
547
+ input_ids,
548
+ char_input_mask=char_input_mask,
549
+ word_input_mask=word_input_mask,
550
+ word_type_ids=word_type_ids,
551
+ output_hidden_states=output_hidden_states,
552
+ combined_word_embeddings=False,
553
+ )
554
+
555
+ emb = outputs.last_hidden_state[:, 0]
556
+ emb = self.dense(emb)
557
+ emb = self.activation(emb)
558
+ emb = self.dropout(emb)
559
+ logits = self.classifier(emb)
560
+
561
+ loss = None
562
+ if labels is not None:
563
+ if self.config.problem_type is None:
564
+ if self.num_labels == 1:
565
+ # regression task
566
+ loss_fn = nn.MSELoss()
567
+ logits = logits.view(-1).to(labels.dtype)
568
+ loss = loss_fn(logits, labels.view(-1))
569
+ elif labels.dim() == 1 or labels.size(-1) == 1:
570
+ label_index = (labels >= 0).nonzero()
571
+ labels = labels.long()
572
+ if label_index.size(0) > 0:
573
+ labeled_logits = torch.gather(
574
+ logits, 0, label_index.expand(label_index.size(0), logits.size(1))
575
+ )
576
+ labels = torch.gather(labels, 0, label_index.view(-1))
577
+ loss_fct = nn.CrossEntropyLoss()
578
+ loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
579
+ else:
580
+ loss = torch.tensor(0).to(logits)
581
+ else:
582
+ log_softmax = nn.LogSoftmax(-1)
583
+ loss = -((log_softmax(logits) * labels).sum(-1)).mean()
584
+ elif self.config.problem_type == "regression":
585
+ loss_fct = nn.MSELoss()
586
+ if self.num_labels == 1:
587
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
588
+ else:
589
+ loss = loss_fct(logits, labels)
590
+ elif self.config.problem_type == "single_label_classification":
591
+ loss_fct = nn.CrossEntropyLoss()
592
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
593
+ elif self.config.problem_type == "multi_label_classification":
594
+ loss_fct = nn.BCEWithLogitsLoss()
595
+ loss = loss_fct(logits, labels)
596
+ if not return_dict:
597
+ output = (logits,) + outputs[1:]
598
+ return ((loss,) + output) if loss is not None else output
599
+
600
+ return SequenceClassifierOutput(
601
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states)
602
+
603
+
604
+ AutoConfig.register("hlm", HLMConfig)
605
+ AutoModel.register(HLMConfig, HLMModel)
606
+ AutoModelForTokenClassification.register(HLMConfig, HLMForTokenClassification)
607
+ AutoModelForSequenceClassification.register(HLMConfig, HLMForSequenceClassification)
608
+ AutoModelForMaskedLM.register(HLMConfig, HLMForMaskedLM)
609
+ AutoTokenizer.register(HLMConfig, HLMTokenizer)
610
+ HLMConfig.register_for_auto_class()
611
+ HLMModel.register_for_auto_class("AutoModel")
612
+ HLMForMaskedLM.register_for_auto_class("AutoModelForMaskedLM")
613
+ HLMForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")
614
+ HLMForTokenClassification.register_for_auto_class("AutoModelForTokenClassification")
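modeling_hlm.py replaces learned absolute positions with rotary position embeddings applied to queries and keys (`precompute_freqs_cis` / `apply_rotary_emb`). A self-contained sketch of the same real-valued rotation, checking two properties RoPE relies on: it preserves vector norms, and the query-key dot product depends only on the relative offset between positions (toy dimensions and helper names, not the model's own code):

```python
import torch

def precompute_freqs(dim: int, end: int, theta: float = 10000.0):
    # One frequency per pair of channels, as in the model's precompute_freqs_cis.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    angles = torch.outer(torch.arange(end).float(), freqs)  # (end, dim/2)
    return torch.cos(angles), torch.sin(angles)

def rotate(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (seq, dim); rotate each (even, odd) channel pair by the position's angle.
    x_r, x_i = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    out_r = x_r * cos - x_i * sin
    out_i = x_r * sin + x_i * cos
    return torch.stack([out_r, out_i], dim=-1).flatten(-2)

dim, seq = 8, 6
cos, sin = precompute_freqs(dim, seq)
q, k = torch.randn(seq, dim), torch.randn(seq, dim)
q_rot, k_rot = rotate(q, cos, sin), rotate(k, cos, sin)

# 1) Norms are unchanged by the rotation.
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))

# 2) The attention score between positions i and j depends only on i - j:
#    compare (i=2, j=0) with (i=5, j=3) using the same underlying vectors.
q0, k0 = torch.randn(dim), torch.randn(dim)
def score(i: int, j: int) -> torch.Tensor:
    qi = rotate(q0.unsqueeze(0), cos[i:i + 1], sin[i:i + 1])[0]
    kj = rotate(k0.unsqueeze(0), cos[j:j + 1], sin[j:j + 1])[0]
    return (qi * kj).sum()
print(torch.allclose(score(2, 0), score(5, 3), atol=1e-5))
```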
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
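modules.json is what `SentenceTransformer(...)` reads to assemble the pipeline: module 0 is the HLM transformer at the repository root and module 1 is the CLS pooling layer in `1_Pooling/`. A small sketch of that resolution step, assuming a local copy of the file (it only inspects the spec and imports the module classes; it does not load weights):

```python
import importlib
import json

# Each entry names a sentence-transformers module class and the sub-directory
# holding that module's configuration.
with open("modules.json") as f:
    module_specs = json.load(f)

for spec in module_specs:
    package, _, class_name = spec["type"].rpartition(".")
    module_cls = getattr(importlib.import_module(package), class_name)
    location = spec["path"] or "<repository root>"
    print(f"module {spec['idx']}: {module_cls.__name__} loaded from {location}")

# Expected output for this checkpoint:
#   module 0: Transformer loaded from <repository root>
#   module 1: Pooling loaded from 1_Pooling
```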
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 256,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenization_hlm.py ADDED
@@ -0,0 +1,664 @@
1
+ import os
2
+ import json
3
+ import unicodedata
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from collections.abc import Mapping
6
+ from collections import Counter
7
+ import itertools
8
+ import torch
9
+
10
+ from transformers.tokenization_utils import PreTrainedTokenizer, PaddingStrategy, TruncationStrategy, TensorType, BatchEncoding
11
+ from transformers.utils import logging, is_torch_tensor
12
+
13
+ TextInput = str
14
+ PreTokenizedInput = List[str]
15
+ EncodedInput = List[List[int]]
16
+ TextInputPair = Tuple[TextInput, TextInput]
17
+ PreTokenizedInputPair = Tuple[PreTokenizedInput, PreTokenizedInput]
18
+ EncodedInputPair = Tuple[EncodedInput, EncodedInput]
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
23
+
24
+ # TODO: add support for return_offsets_mapping
25
+
26
+ class HLMTokenizer(PreTrainedTokenizer):
27
+ r"""
28
+ Constructs a HLM tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
29
+
30
+ Args:
31
+ vocab_file (`str`):
32
+ Path to .json vocab file.
33
+ bos_token (`string`, *optional*, defaults to `"[CLS]"`):
34
+ The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
35
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
36
+ sequence. The token used is the `cls_token`.
37
+ eos_token (`string`, *optional*, defaults to `"[SEP]"`):
38
+ The end of sequence token. When building a sequence using special tokens, this is not the token that is
39
+ used for the end of sequence. The token used is the `sep_token`.
40
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
41
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
42
+ token instead.
43
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
44
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
45
+ sequence classification or for a text and a question for question answering. It is also used as the last
46
+ token of a sequence built with special tokens.
47
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
48
+ The token used for padding, for example when batching sequences of different lengths.
49
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
50
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
51
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
52
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
53
+ The token used for masking values. This is the token used when training this model with masked language
54
+ modeling. This is the token which the model will try to predict.
55
+ word_cls_token (`str`, *optional*, defaults to `"[WORD_CLS]"`):
56
+ The classifier token which is used for word representations and word classification.
57
+ It is the first token of each word when built with special tokens.
58
+ """
59
+
60
+ vocab_files_names = VOCAB_FILES_NAMES
61
+ model_input_names: List[str] = ["input_ids", "char_input_mask", "word_input_mask", "word_type_ids"]
62
+ padding_side: str = "right"
63
+ truncation_side: str = "right"
64
+
65
+ def __init__(
66
+ self,
67
+ vocab_file,
68
+ split_by_punct=False,
69
+ bos_token="[CLS]",
70
+ eos_token="[SEP]",
71
+ unk_token="[UNK]",
72
+ sep_token="[SEP]",
73
+ pad_token="[PAD]",
74
+ cls_token="[CLS]",
75
+ mask_token="[MASK]",
76
+ word_cls_token="[WORD_CLS]",
77
+ max_word_length=None,
78
+ model_max_length=None,
79
+ **kwargs,
80
+ ) -> None:
81
+ if not os.path.isfile(vocab_file):
82
+ raise ValueError(
83
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained"
84
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
85
+ )
86
+
87
+ if max_word_length is not None:
88
+ self.max_word_length = max_word_length
89
+ else:
90
+ try:
91
+ with open(os.path.dirname(vocab_file) + "/config.json", "r") as f:
92
+ config = json.load(f)
93
+ self.max_word_length = config["max_word_length"]
94
+ if model_max_length is None:
95
+ model_max_length = config.get("max_seq_length", None)
96
+ except:
97
+ raise ValueError("Failed to load max_word_length from config.json. Please specify max_word_length.")
98
+
99
+ self.split_by_punct = split_by_punct
100
+ self.vocab_file = vocab_file
101
+ with open(vocab_file, 'r', encoding='utf-8') as f:
102
+ vocab_data = json.load(f)
103
+ self.vocab = vocab_data["vocab"]
104
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
105
+
106
+ super().__init__(
107
+ bos_token=bos_token,
108
+ eos_token=eos_token,
109
+ unk_token=unk_token,
110
+ sep_token=sep_token,
111
+ pad_token=pad_token,
112
+ cls_token=cls_token,
113
+ mask_token=mask_token,
114
+ split_by_punct=split_by_punct,
115
+ model_max_length=model_max_length,
116
+ **kwargs,
117
+ )
118
+ self.unk_id = self.vocab["[UNK]"]
119
+ self.word_cls_token = word_cls_token
120
+ self.word_cls_token_id = self._convert_token_to_id(word_cls_token)
121
+ self.label_pad_token_id = -100
122
+ self.special_ids = [self._convert_token_to_id(token) for token in vocab_data["special_tokens"]]
123
+
124
+ #self.pad_word = [[self.word_cls_token_id] + [0]*(self.max_word_length-1)]
125
+ #self.pad_mask_word = [[1] + [0]*(self.max_word_length-1)]
126
+ self.pad_word = [[0] + [0]*(self.max_word_length-1)]
127
+ self.pad_mask_word = [[0] + [0]*(self.max_word_length-1)]
128
+
129
+ @staticmethod
130
+ def train(files: List[Union[str, os.PathLike]], output_dir: Union[str, os.PathLike], vocab_size: int=512, max_lines_to_consider=2_000_000):
131
+ char_maps = []
132
+ # Each input file is weighted equally, regardless of size
133
+ # This is to prevent one language from dominating the character distribution
134
+ for file in files:
135
+ print('Loading char counts from', file)
136
+ counter = Counter()
137
+ line_count = 0
138
+ with open(file, "r", encoding="utf-8") as file:
139
+ while line_count < max_lines_to_consider:
140
+ lines = file.readlines(100*1024)
141
+ if len(lines) == 0:
142
+ break
143
+ for line in lines:
144
+ line = unicodedata.normalize('NFKC', line)
145
+ line_count += 1
146
+ counter.update(line)
147
+ d = {}
148
+ total = counter.total()
149
+ for char, count in counter.items():
150
+ d[char] = count / total
151
+ char_maps.append(d)
152
+
153
+ char_map = {}
154
+ for d in char_maps:
155
+ for char, freq in d.items():
156
+ if not char.isspace():
157
+ char_map[char] = char_map.get(char, 0) + freq
158
+
159
+ special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[WORD_CLS]']
160
+ chars_to_keep = sorted(list(char_map.keys()), key=lambda c: char_map[c], reverse=True)[:vocab_size-len(special_tokens)]
161
+ vocab_entries = [*special_tokens, *chars_to_keep]
162
+
163
+ vocab = {
164
+ 'special_tokens': special_tokens,
165
+ 'vocab': { key: i for i, key in enumerate(vocab_entries) }
166
+ }
167
+
168
+ assert(len(vocab_entries) == vocab_size)
169
+
170
+ filename = os.path.join(output_dir, VOCAB_FILES_NAMES["vocab_file"])
171
+ os.makedirs(output_dir, exist_ok=True)
172
+ print("Saving vocab to", filename)
173
+ with open(filename, 'w', encoding='utf-8') as f:
174
+ json.dump(vocab, f, ensure_ascii=False, indent=4)
175
+
176
+ return filename
177
+
178
+ @property
179
+ def vocab_size(self):
180
+ return len(self.vocab)
181
+
182
+ def get_vocab(self):
183
+ return self.vocab
184
+
185
+ def _convert_token_to_id(self, token):
186
+ """Converts a token (str) to an id using the vocab."""
187
+ return self.vocab.get(token, self.unk_id)
188
+
189
+ def _convert_id_to_token(self, index):
190
+ """Converts an index (integer) in a token (str) using the vocab."""
191
+ return self.inv_vocab[index] if index < self.vocab_size else self.unk_token
192
+
193
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str], List[List[str]]]):
194
+ if isinstance(tokens, str):
195
+ return self._convert_token_to_id(tokens)
196
+ if len(tokens) > 0 and isinstance(tokens[0], str):
197
+ return [self._convert_token_to_id(token) for token in tokens]
198
+ return [[self._convert_token_to_id(token) for token in word] for word in tokens]
199
+
200
+ def convert_tokens_to_string(self, tokens):
201
+ """Converts a sequence of tokens (string) in a single string."""
202
+ raise NotImplementedError
203
+
204
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
205
+ if token_ids_1 is None:
206
+ return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id]]
207
+ return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id], [self.cls_token_id]] + token_ids_1 + [[self.eos_token_id]]
208
+
209
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
210
+ return 3 if pair else 2
211
+
212
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
213
+ raise NotImplementedError
214
+
215
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None, has_special_tokens=False):
216
+ if has_special_tokens:
217
+ return [0] * (len(token_ids_0)+2) + ([1] * (len(token_ids_1)+2) if token_ids_1 is not None else [])
218
+ else:
219
+ return [0] * len(token_ids_0) + ([1] * len(token_ids_1) if token_ids_1 is not None else [])
220
+
221
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
222
+ filename = VOCAB_FILES_NAMES["vocab_file"]
223
+ if filename_prefix is not None:
224
+ filename = filename_prefix + "-" + filename
225
+ full_path = os.path.join(save_directory, filename)
226
+ with open(full_path, "w", encoding="utf-8") as f:
227
+ json.dump({
228
+ "special_tokens": self.all_special_tokens,
229
+ "vocab": self.get_vocab(),
230
+ }, f, ensure_ascii=False, indent=4)
231
+ return (full_path,)
232
+
233
+ def encode(
234
+ self,
235
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
236
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
237
+ is_split_into_words: bool = False,
238
+ add_special_tokens: bool = False,
239
+ padding: Union[bool, str, PaddingStrategy] = False,
240
+ truncation: Union[bool, str, TruncationStrategy] = None,
241
+ max_length: Optional[int] = None,
242
+ return_tensors: Optional[Union[str, TensorType]] = None,
243
+ **kwargs,
244
+ ) -> List[int]:
245
+ def get_input_ids(text):
246
+ if isinstance(text, str):
247
+ tokens = self.tokenize(text, **kwargs)
248
+ return self.convert_tokens_to_ids(tokens)
249
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
250
+ if is_split_into_words:
251
+ tokens = list(
252
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
253
+ )
254
+ return self.convert_tokens_to_ids(tokens)
255
+ else:
256
+ return self.convert_tokens_to_ids(text)
257
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], List[int]):
258
+ return text
259
+ else:
260
+ raise ValueError(
261
+ f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
262
+
263
+ first_ids = get_input_ids(text)
264
+ second_ids = get_input_ids(text_pair) if text_pair is not None else None
265
+
266
+ if add_special_tokens:
267
+ sequence = self.build_inputs_with_special_tokens(first_ids, second_ids)
268
+ else:
269
+ sequence = first_ids
270
+
271
+ return sequence
272
+
273
+ def prepare_for_model(
274
+ self,
275
+ ids: List[List[int]],
276
+ pair_ids: Optional[List[List[int]]] = None,
277
+ add_special_tokens: bool = True,
278
+ padding: Union[bool, str, PaddingStrategy] = False,
279
+ truncation: Union[bool, str, TruncationStrategy] = None,
280
+ max_length: Optional[int] = None,
281
+ stride: int = 0,
282
+ pad_to_multiple_of: Optional[int] = None,
283
+ return_tensors: Optional[Union[str, TensorType]] = None,
284
+ return_token_type_ids: Optional[bool] = None,
285
+ return_attention_mask: bool = True,
286
+ return_overflowing_tokens: bool = False,
287
+ return_special_tokens_mask: bool = False,
288
+ return_offsets_mapping: bool = False,
289
+ return_length: bool = False,
290
+ verbose: bool = True,
291
+ add_word_cls: bool = True,
292
+ prepend_batch_axis: bool = False,
293
+ **kwargs,
294
+ ) -> BatchEncoding:
295
+ """
296
+ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
297
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
298
+ manages a moving window (with user defined stride) for overflowing tokens.
299
+
300
+ Args:
301
+ ids (`List[List[int]]`):
302
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
303
+ `convert_tokens_to_ids` methods.
304
+ pair_ids (`List[List[int]]`, *optional*):
305
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
306
+ and `convert_tokens_to_ids` methods.
307
+ """
308
+
309
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
310
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
311
+ padding=padding,
312
+ truncation=truncation,
313
+ max_length=max_length,
314
+ pad_to_multiple_of=pad_to_multiple_of,
315
+ verbose=verbose,
316
+ **kwargs,
317
+ )
318
+
319
+ pair = bool(pair_ids is not None)
320
+ len_pair_ids = len(pair_ids) if pair else 0
321
+
322
+ if return_token_type_ids and not add_special_tokens:
323
+ raise ValueError(
324
+ "Asking to return token_type_ids while setting add_special_tokens to False "
325
+ "results in an undefined behavior. Please set add_special_tokens to True or "
326
+ "set return_token_type_ids to None."
327
+ )
328
+
329
+ if (
330
+ return_overflowing_tokens
331
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
332
+ and pair_ids is not None
333
+ ):
334
+ raise ValueError(
335
+ "Not possible to return overflowing tokens for pair of sequences with the "
336
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
337
+ "for instance `only_second` or `only_first`."
338
+ )
339
+
340
+ encoded_inputs = {}
341
+
342
+ # Compute the total size of the returned encodings
343
+ total_len = len(ids) + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
344
+
345
+ # Truncation: Handle max sequence length
346
+ overflowing_tokens = []
347
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
348
+ ids, pair_ids, overflowing_tokens = self.truncate_sequences(
349
+ ids,
350
+ pair_ids=pair_ids,
351
+ num_tokens_to_remove=total_len - max_length,
352
+ truncation_strategy=truncation_strategy,
353
+ stride=stride,
354
+ )
355
+
356
+ if return_overflowing_tokens:
357
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
358
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length
359
+
360
+ if add_special_tokens:
361
+ sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
362
+ else:
363
+ sequence = ids + pair_ids if pair else ids
364
+
365
+ if add_word_cls:
366
+ for word in sequence:
367
+ word.insert(0, self.word_cls_token_id)
368
+
369
+ # Build output dictionary
370
+ encoded_inputs["input_ids"] = sequence
371
+ encoded_inputs["char_input_mask"] = [[1]*len(word)+[0]*(self.max_word_length-len(word)) for word in sequence]
372
+ encoded_inputs["word_input_mask"] = [1]*len(sequence)
373
+ if return_token_type_ids or pair:
374
+ encoded_inputs["word_type_ids"] = self.create_token_type_ids_from_sequences(ids, pair_ids, add_special_tokens)
375
+ assert len(encoded_inputs["word_type_ids"]) == len(encoded_inputs["word_input_mask"])
376
+
377
+ # Always pad words
378
+ for word in encoded_inputs["input_ids"]:
379
+ if len(word) < self.max_word_length:
380
+ word.extend([self.pad_token_id] * (self.max_word_length - len(word)))
381
+
382
+ # Padding
383
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
384
+ encoded_inputs = self.pad(
385
+ encoded_inputs,
386
+ max_length=max_length,
387
+ padding=padding_strategy.value,
388
+ pad_to_multiple_of=pad_to_multiple_of,
389
+ return_attention_mask=return_attention_mask,
390
+ )
391
+
392
+ batch_outputs = BatchEncoding(
393
+ encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
394
+ )
395
+
396
+ return batch_outputs
397
+
398
+ def _encode_plus(
399
+ self,
400
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
401
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
402
+ add_special_tokens: bool = True,
403
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
404
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
405
+ max_length: Optional[int] = None,
406
+ stride: int = 0,
407
+ is_split_into_words: bool = False,
408
+ pad_to_multiple_of: Optional[int] = None,
409
+ return_tensors: Optional[Union[str, TensorType]] = None,
410
+ return_token_type_ids: Optional[bool] = None,
411
+ return_attention_mask: Optional[bool] = None,
412
+ return_overflowing_tokens: bool = False,
413
+ return_special_tokens_mask: bool = False,
414
+ return_offsets_mapping: bool = False,
415
+ return_length: bool = False,
416
+ verbose: bool = True,
417
+ add_word_cls: bool = True,
418
+ **kwargs,
419
+ ) -> BatchEncoding:
420
+ def get_input_ids(text):
421
+ if isinstance(text, str):
422
+ tokens = self.tokenize(text, **kwargs)
423
+ return self.convert_tokens_to_ids(tokens)
424
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
425
+ if is_split_into_words:
426
+ tokens = list(
427
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
428
+ )
429
+ return self.convert_tokens_to_ids(tokens)
430
+ else:
431
+ return self.convert_tokens_to_ids(text)
432
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
433
+ return text
434
+ else:
435
+ raise ValueError(
436
+ f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
437
+
438
+ if return_offsets_mapping:
439
+ raise NotImplementedError(
440
+ "return_offset_mapping is not available when using Python tokenizers. "
441
+ "To use this feature, change your tokenizer to one deriving from "
442
+ "transformers.PreTrainedTokenizerFast. "
443
+ "More information on available tokenizers at "
444
+ "https://github.com/huggingface/transformers/pull/2674"
445
+ )
446
+
447
+ first_ids = get_input_ids(text)
448
+ second_ids = get_input_ids(text_pair) if text_pair is not None else None
449
+
450
+ return self.prepare_for_model(
451
+ first_ids,
452
+ pair_ids=second_ids,
453
+ add_special_tokens=add_special_tokens,
454
+ padding=padding_strategy.value,
455
+ truncation=truncation_strategy.value,
456
+ max_length=max_length,
457
+ stride=stride,
458
+ pad_to_multiple_of=pad_to_multiple_of,
459
+ return_tensors=return_tensors,
460
+ prepend_batch_axis=True,
461
+ return_attention_mask=return_attention_mask,
462
+ return_token_type_ids=return_token_type_ids,
463
+ return_overflowing_tokens=return_overflowing_tokens,
464
+ return_special_tokens_mask=return_special_tokens_mask,
465
+ return_length=return_length,
466
+ verbose=verbose,
467
+ add_word_cls=add_word_cls,
468
+ )
469
+
470
+ def _batch_encode_plus(
471
+ self,
472
+ batch_text_or_text_pairs: Union[
473
+ List[TextInput],
474
+ List[TextInputPair],
475
+ List[PreTokenizedInput],
476
+ List[PreTokenizedInputPair],
477
+ List[EncodedInput],
478
+ List[EncodedInputPair],
479
+ ],
480
+ add_special_tokens: bool = True,
481
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
482
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
483
+ max_length: Optional[int] = None,
484
+ stride: int = 0,
485
+ is_split_into_words: bool = False,
486
+ pad_to_multiple_of: Optional[int] = None,
487
+ return_tensors: Optional[Union[str, TensorType]] = None,
488
+ return_token_type_ids: Optional[bool] = None,
489
+ return_attention_mask: Optional[bool] = None,
490
+ return_overflowing_tokens: bool = False,
491
+ return_special_tokens_mask: bool = False,
492
+ return_offsets_mapping: bool = False,
493
+ return_length: bool = False,
494
+ verbose: bool = True,
495
+ **kwargs,
496
+ ) -> BatchEncoding:
497
+ def get_input_ids(text):
498
+ if isinstance(text, str):
499
+ tokens = self.tokenize(text, **kwargs)
500
+ return self.convert_tokens_to_ids(tokens)
501
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
502
+ if is_split_into_words:
503
+ tokens = list(
504
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
505
+ )
506
+ return self.convert_tokens_to_ids(tokens)
507
+ else:
508
+ return self.convert_tokens_to_ids(text)
509
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
510
+ return text
511
+ else:
512
+ raise ValueError(
513
+ "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
514
+ )
515
+
516
+ if return_offsets_mapping:
517
+ raise NotImplementedError(
518
+ "return_offset_mapping is not available when using Python tokenizers. "
519
+ "To use this feature, change your tokenizer to one deriving from "
520
+ "transformers.PreTrainedTokenizerFast."
521
+ )
522
+
523
+ input_ids = []
524
+ for ids_or_pair_ids in batch_text_or_text_pairs:
525
+ if not isinstance(ids_or_pair_ids, (list, tuple)):
526
+ ids, pair_ids = ids_or_pair_ids, None
527
+ elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
528
+ ids, pair_ids = ids_or_pair_ids, None
529
+ else:
530
+ ids, pair_ids = ids_or_pair_ids
531
+
532
+ first_ids = get_input_ids(ids)
533
+ second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
534
+ input_ids.append((first_ids, second_ids))
535
+
536
+ batch_outputs = self._batch_prepare_for_model(
537
+ input_ids,
538
+ add_special_tokens=add_special_tokens,
539
+ padding_strategy=padding_strategy,
540
+ truncation_strategy=truncation_strategy,
541
+ max_length=max_length,
542
+ stride=stride,
543
+ pad_to_multiple_of=pad_to_multiple_of,
544
+ return_attention_mask=return_attention_mask,
545
+ return_token_type_ids=return_token_type_ids,
546
+ return_overflowing_tokens=return_overflowing_tokens,
547
+ return_special_tokens_mask=return_special_tokens_mask,
548
+ return_length=return_length,
549
+ return_tensors=return_tensors,
550
+ verbose=verbose,
551
+ )
552
+
553
+ return BatchEncoding(batch_outputs)
554
+
555
+ def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, split_long_words: bool = True) -> List[List[str]]:
556
+ text = unicodedata.normalize('NFKC', text)
557
+ if split_long_words:
558
+ tokenized_text = []
559
+ for token in text.split():
560
+ tokens = [char for char in token]
561
+ tokenized_text.extend(
562
+ tokens[i: i + self.max_word_length - 1] for i in range(0, len(tokens), self.max_word_length - 1))
563
+ return tokenized_text
564
+ else:
565
+ return [[char for char in token] for token in text.split()]
566
+
567
+ def pad(
568
+ self,
569
+ encoded_inputs: Union[
570
+ BatchEncoding,
571
+ List[BatchEncoding],
572
+ Dict[str, EncodedInput],
573
+ Dict[str, List[EncodedInput]],
574
+ List[Dict[str, EncodedInput]],
575
+ ],
576
+ padding: Union[bool, str, PaddingStrategy] = True,
577
+ max_length: Optional[int] = None,
578
+ pad_to_multiple_of: Optional[int] = None, # TODO: add support for pad_to_multiple_of
579
+ return_attention_mask: Optional[bool] = None,
580
+ return_tensors: Optional[Union[str, TensorType]] = None,
581
+ #label_pad_token_id=-100,
582
+ verbose: bool = True,
583
+ ) -> BatchEncoding:
584
+ # If we have a list of dicts, let's convert it to a dict of lists
585
+ # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
586
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
587
+ encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
588
+
589
+ # The model's main input name, usually `input_ids`, has to be passed for padding
590
+ #if self.model_input_names[0] not in encoded_inputs:
591
+ # raise ValueError(
592
+ # "You should supply an encoding or a list of encodings to this method "
593
+ # f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
594
+ # )
595
+
596
+ required_input = encoded_inputs["input_ids"]
597
+
598
+ #if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
599
+ # if return_attention_mask:
600
+ # encoded_inputs["char_input_mask"] = []
601
+ # encoded_inputs["word_input_mask"] = []
602
+ # return encoded_inputs
603
+
604
+ # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
605
+ # and rebuild them afterwards if no return_tensors is specified
606
+ # Note that we lose the specific device the tensor may be on for PyTorch
607
+
608
+ #first_element = required_input[0]
609
+ ## At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
610
+ #if not isinstance(first_element, (int, list, tuple)):
611
+ # if is_torch_tensor(first_element):
612
+ # return_tensors = "pt" if return_tensors is None else return_tensors
613
+
614
+ # for key, value in encoded_inputs.items():
615
+ # encoded_inputs[key] = to_py_obj(value)
616
+
617
+ # Convert padding_strategy in PaddingStrategy
618
+ padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
619
+ padding=padding, max_length=max_length, verbose=verbose)
620
+
621
+ if padding_strategy == PaddingStrategy.DO_NOT_PAD:
622
+ return encoded_inputs
623
+
624
+ assert (padding_strategy == PaddingStrategy.LONGEST)
625
+
626
+ longest_in_batch = max(len(f) for f in required_input)
627
+ batch_outputs = {}
628
+ batch_outputs["input_ids"] = [f + self.pad_word*(longest_in_batch - len(f)) for f in encoded_inputs["input_ids"]]
629
+ batch_outputs["char_input_mask"] = [f + self.pad_mask_word*(longest_in_batch - len(f)) for f in encoded_inputs["char_input_mask"]]
630
+
631
+ batch_outputs["word_input_mask"] = \
632
+ [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs['word_input_mask']]
633
+
634
+ if "word_type_ids" in encoded_inputs:
635
+ batch_outputs["word_type_ids"] = [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs["word_type_ids"]]
636
+
637
+ batch_outputs["char_input_mask"] = torch.tensor(batch_outputs["char_input_mask"], dtype=torch.bool)
638
+ batch_outputs["word_input_mask"] = torch.tensor(batch_outputs["word_input_mask"], dtype=torch.bool)
639
+
640
+ # TODO: move label names elsewhere
641
+ label_fields = ('labels', 'upos', 'feats', 'heads', 'deprels', 'lemmas')
642
+ label_names = [feature for feature in encoded_inputs.keys() if feature in label_fields]
643
+
644
+ if len(label_names) > 0:
645
+ def to_list(tensor_or_iterable):
646
+ if is_torch_tensor(tensor_or_iterable):
647
+ return tensor_or_iterable.tolist()
648
+ return list(tensor_or_iterable)
649
+
650
+ for label_name in label_names:
651
+ if label_name not in encoded_inputs:
652
+ continue
653
+ labels = encoded_inputs[label_name]
654
+ label_pad_word = [[self.label_pad_token_id]*self.max_word_length]
655
+ if self.padding_side == "right":
656
+ batch_outputs[label_name] = [
657
+ to_list(label) + label_pad_word * (longest_in_batch - len(label)) for label in labels
658
+ ]
659
+ else:
660
+ batch_outputs[label_name] = [
661
+ label_pad_word * (longest_in_batch - len(label)) + to_list(label) for label in labels
662
+ ]
663
+
664
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
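Taken together, the methods above encode each whitespace-separated word as its own list of character ids: `tokenize` splits text into per-word character lists (NFKC-normalized, long words chunked), `prepare_for_model` prepends the `[WORD_CLS]` id to every word and pads each word to `max_word_length`, and `pad` batches words with character-level and word-level masks. The sketch below is illustrative only and not part of the commit: `encode_words`, `MAX_WORD_LENGTH = 8`, and the toy character vocabulary are assumptions, while ids 5 (`[WORD_CLS]`) and 0 (`[PAD]`) match the vocab.json added below.

```python
# Minimal sketch (not part of the commit) of the word/character layout the
# tokenizer above produces. MAX_WORD_LENGTH, encode_words, and the toy
# vocabulary are assumptions; ids 5 ([WORD_CLS]) and 0 ([PAD]) match vocab.json.
MAX_WORD_LENGTH = 8
WORD_CLS_ID, PAD_ID = 5, 0

def encode_words(words, char_to_id):
    input_ids, char_mask = [], []
    for word in words:
        # one [WORD_CLS] id plus the word's character ids (the real tokenizer
        # splits overlong words into chunks instead of truncating them)
        chars = [WORD_CLS_ID] + [char_to_id[c] for c in word][: MAX_WORD_LENGTH - 1]
        char_mask.append([1] * len(chars) + [0] * (MAX_WORD_LENGTH - len(chars)))
        input_ids.append(chars + [PAD_ID] * (MAX_WORD_LENGTH - len(chars)))
    return {
        "input_ids": input_ids,        # [num_words, MAX_WORD_LENGTH] character ids
        "char_input_mask": char_mask,  # 1 for real characters, 0 for padding
        "word_input_mask": [1] * len(words),
    }

print(encode_words(["an", "example"], {c: i + 6 for i, c in enumerate("anexmpl")}))
```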
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "eos_token": "[SEP]",
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "split_by_punct": false,
53
+ "tokenizer_class": "HLMTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
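The configuration above registers the five special tokens at ids 0–4 and points `tokenizer_class` at `HLMTokenizer`. As a quick sanity check, the ids declared in `added_tokens_decoder` can be compared against the character vocabulary added below; the snippet is hypothetical and assumes both JSON files sit in the working directory.

```python
# Hypothetical check (not part of the commit): the ids in added_tokens_decoder
# should agree with the [PAD]/[UNK]/[CLS]/[SEP]/[MASK] entries of vocab.json.
import json

with open("tokenizer_config.json") as f:  # assumed local path
    added = json.load(f)["added_tokens_decoder"]
with open("vocab.json") as f:             # assumed local path
    vocab = json.load(f)["vocab"]

for token_id, entry in added.items():
    assert vocab[entry["content"]] == int(token_id), entry["content"]
print("special token ids consistent:", [e["content"] for e in added.values()])
```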
vocab.json ADDED
@@ -0,0 +1,523 @@
1
+ {
2
+ "special_tokens": [
3
+ "[CLS]",
4
+ "[SEP]",
5
+ "[UNK]",
6
+ "[PAD]",
7
+ "[MASK]"
8
+ ],
9
+ "vocab": {
10
+ "[PAD]": 0,
11
+ "[UNK]": 1,
12
+ "[CLS]": 2,
13
+ "[SEP]": 3,
14
+ "[MASK]": 4,
15
+ "[WORD_CLS]": 5,
16
+ "e": 6,
17
+ "t": 7,
18
+ "ν": 8,
19
+ "a": 9,
20
+ "τ": 10,
21
+ "o": 11,
22
+ "α": 12,
23
+ "n": 13,
24
+ "ο": 14,
25
+ "h": 15,
26
+ "i": 16,
27
+ "s": 17,
28
+ "r": 18,
29
+ "ε": 19,
30
+ "ι": 20,
31
+ "d": 21,
32
+ "l": 22,
33
+ "ς": 23,
34
+ ",": 24,
35
+ "ρ": 25,
36
+ "κ": 26,
37
+ "σ": 27,
38
+ "π": 28,
39
+ "u": 29,
40
+ "μ": 30,
41
+ "λ": 31,
42
+ "m": 32,
43
+ "c": 33,
44
+ "f": 34,
45
+ "w": 35,
46
+ ".": 36,
47
+ "δ": 37,
48
+ "g": 38,
49
+ "y": 39,
50
+ "υ": 40,
51
+ "p": 41,
52
+ "ί": 42,
53
+ "γ": 43,
54
+ "ὶ": 44,
55
+ "b": 45,
56
+ "ω": 46,
57
+ "έ": 47,
58
+ "ἐ": 48,
59
+ "η": 49,
60
+ "θ": 50,
61
+ "ὸ": 51,
62
+ "ά": 52,
63
+ "ό": 53,
64
+ "ἀ": 54,
65
+ "v": 55,
66
+ "ῦ": 56,
67
+ "ὰ": 57,
68
+ "χ": 58,
69
+ "φ": 59,
70
+ "k": 60,
71
+ "ῶ": 61,
72
+ "ὐ": 62,
73
+ "ύ": 63,
74
+ "ῖ": 64,
75
+ "̅": 65,
76
+ "ὲ": 66,
77
+ "ὴ": 67,
78
+ "’": 68,
79
+ "I": 69,
80
+ "β": 70,
81
+ "ῆ": 71,
82
+ "ή": 72,
83
+ "ἰ": 73,
84
+ "\"": 74,
85
+ "·": 75,
86
+ "ξ": 76,
87
+ "T": 77,
88
+ ";": 78,
89
+ "ἔ": 79,
90
+ "ὁ": 80,
91
+ "A": 81,
92
+ "ἡ": 82,
93
+ "ώ": 83,
94
+ "ὑ": 84,
95
+ "ῷ": 85,
96
+ "ἄ": 86,
97
+ ":": 87,
98
+ "”": 88,
99
+ "“": 89,
100
+ "ζ": 90,
101
+ "ὺ": 91,
102
+ "ὅ": 92,
103
+ "S": 93,
104
+ "x": 94,
105
+ "H": 95,
106
+ "ἱ": 96,
107
+ "L": 97,
108
+ "-": 98,
109
+ "'": 99,
110
+ "M": 100,
111
+ "ῳ": 101,
112
+ "?": 102,
113
+ "ῇ": 103,
114
+ "ψ": 104,
115
+ "B": 105,
116
+ "W": 106,
117
+ "C": 107,
118
+ "ᾶ": 108,
119
+ "ὡ": 109,
120
+ "ἑ": 110,
121
+ "2": 111,
122
+ "ἴ": 112,
123
+ "ἶ": 113,
124
+ "—": 114,
125
+ "E": 115,
126
+ "Κ": 116,
127
+ "O": 117,
128
+ "Ἀ": 118,
129
+ "Π": 119,
130
+ "ὀ": 120,
131
+ "ῃ": 121,
132
+ "N": 122,
133
+ "D": 123,
134
+ "ὕ": 124,
135
+ "ἢ": 125,
136
+ "!": 126,
137
+ "R": 127,
138
+ "P": 128,
139
+ "q": 129,
140
+ "j": 130,
141
+ "1": 131,
142
+ "G": 132,
143
+ "0": 133,
144
+ "ὖ": 134,
145
+ "F": 135,
146
+ "Τ": 136,
147
+ "Σ": 137,
148
+ "ὄ": 138,
149
+ "Δ": 139,
150
+ "ὼ": 140,
151
+ "ἕ": 141,
152
+ "ᾳ": 142,
153
+ "Μ": 143,
154
+ "z": 144,
155
+ "Θ": 145,
156
+ "Y": 146,
157
+ "ἁ": 147,
158
+ "ἂ": 148,
159
+ "ὔ": 149,
160
+ "ῥ": 150,
161
+ "Ε": 151,
162
+ "Α": 152,
163
+ "ἦ": 153,
164
+ ")": 154,
165
+ "(": 155,
166
+ "ὥ": 156,
167
+ "ἷ": 157,
168
+ "J": 158,
169
+ "Ο": 159,
170
+ "ἵ": 160,
171
+ "Ἰ": 161,
172
+ "‘": 162,
173
+ "ʹ": 163,
174
+ "Ἐ": 164,
175
+ "ἤ": 165,
176
+ "3": 166,
177
+ "Λ": 167,
178
+ "ἅ": 168,
179
+ "Β": 169,
180
+ "ὗ": 170,
181
+ "«": 171,
182
+ "»": 172,
183
+ "Γ": 173,
184
+ "[": 174,
185
+ "]": 175,
186
+ "4": 176,
187
+ "ὃ": 177,
188
+ "Χ": 178,
189
+ "ἠ": 179,
190
+ "*": 180,
191
+ "〉": 181,
192
+ "〈": 182,
193
+ "V": 183,
194
+ "K": 184,
195
+ "U": 185,
196
+ "Ν": 186,
197
+ "Φ": 187,
198
+ "5": 188,
199
+ "ὧ": 189,
200
+ "ἥ": 190,
201
+ "6": 191,
202
+ "8": 192,
203
+ "ᾷ": 193,
204
+ "&": 194,
205
+ "7": 195,
206
+ "9": 196,
207
+ "Ῥ": 197,
208
+ "Ι": 198,
209
+ "ὠ": 199,
210
+ "Ζ": 200,
211
+ "Ὁ": 201,
212
+ "Ἡ": 202,
213
+ "ὦ": 203,
214
+ "Ἄ": 204,
215
+ "Ὅ": 205,
216
+ "ϊ": 206,
217
+ "Ἑ": 207,
218
+ "ἃ": 208,
219
+ "X": 209,
220
+ "ἧ": 210,
221
+ "ἣ": 211,
222
+ "Ἔ": 212,
223
+ "Η": 213,
224
+ "Υ": 214,
225
+ "ἓ": 215,
226
+ "ῴ": 216,
227
+ "Ρ": 217,
228
+ "ᾧ": 218,
229
+ "Ὀ": 219,
230
+ "ΐ": 220,
231
+ "Ἱ": 221,
232
+ "`": 222,
233
+ "ῤ": 223,
234
+ "ὢ": 224,
235
+ "Ϛ": 225,
236
+ "Ω": 226,
237
+ "ῄ": 227,
238
+ "ὤ": 228,
239
+ "ᾖ": 229,
240
+ "̲": 230,
241
+ "ᾗ": 231,
242
+ "ἳ": 232,
243
+ "Ἕ": 233,
244
+ "Q": 234,
245
+ "Z": 235,
246
+ "ὓ": 236,
247
+ "„": 237,
248
+ "Ξ": 238,
249
+ "Ὑ": 239,
250
+ "†": 240,
251
+ "ἆ": 241,
252
+ "ὂ": 242,
253
+ "é": 243,
254
+ "+": 244,
255
+ "Ἴ": 245,
256
+ "ᾠ": 246,
257
+ "Ὡ": 247,
258
+ "ϋ": 248,
259
+ "Ἠ": 249,
260
+ "𐅻": 250,
261
+ "|": 251,
262
+ "ᾴ": 252,
263
+ "Ὥ": 253,
264
+ "ᾔ": 254,
265
+ "ῒ": 255,
266
+ "𐆄": 256,
267
+ "Ψ": 257,
268
+ "Ἁ": 258,
269
+ "Ὠ": 259,
270
+ "Ἥ": 260,
271
+ "ᾤ": 261,
272
+ "Ἅ": 262,
273
+ "#": 263,
274
+ "–": 264,
275
+ "̈": 265,
276
+ "Ἵ": 266,
277
+ "𐅶": 267,
278
+ "_": 268,
279
+ "ö": 269,
280
+ "Ὄ": 270,
281
+ "ᾐ": 271,
282
+ "ᾄ": 272,
283
+ "Ἢ": 273,
284
+ "�": 274,
285
+ "Ἤ": 275,
286
+ "Ὃ": 276,
287
+ "Ἦ": 277,
288
+ "𐅵": 278,
289
+ "‖": 279,
290
+ "}": 280,
291
+ "{": 281,
292
+ "͵": 282,
293
+ "=": 283,
294
+ "⸢": 284,
295
+ "⸥": 285,
296
+ "æ": 286,
297
+ "Ὦ": 287,
298
+ "Ἆ": 288,
299
+ "⸤": 289,
300
+ "⏑": 290,
301
+ "ὣ": 291,
302
+ "ᾰ": 292,
303
+ "⟦": 293,
304
+ "⟧": 294,
305
+ "Ὕ": 295,
306
+ "ᾀ": 296,
307
+ "ᾅ": 297,
308
+ "⸏": 298,
309
+ "‹": 299,
310
+ "›": 300,
311
+ "è": 301,
312
+ "á": 302,
313
+ "Ϟ": 303,
314
+ ">": 304,
315
+ "Ὧ": 305,
316
+ "<": 306,
317
+ "Ϙ": 307,
318
+ "œ": 308,
319
+ "ΰ": 309,
320
+ "□": 310,
321
+ "͜": 311,
322
+ "ᾱ": 312,
323
+ "́": 313,
324
+ "ᾑ": 314,
325
+ "ˈ": 315,
326
+ "ë": 316,
327
+ "Ἂ": 317,
328
+ "′": 318,
329
+ "ῐ": 319,
330
+ "ϝ": 320,
331
+ "Ὢ": 321,
332
+ "ᾆ": 322,
333
+ "ῠ": 323,
334
+ "⩚": 324,
335
+ "►": 325,
336
+ "◄": 326,
337
+ "§": 327,
338
+ "𐆃": 328,
339
+ "ñ": 329,
340
+ "ῑ": 330,
341
+ "×": 331,
342
+ "Ἃ": 332,
343
+ "ῡ": 333,
344
+ "ἲ": 334,
345
+ "ῂ": 335,
346
+ "⸣": 336,
347
+ "±": 337,
348
+ "‵": 338,
349
+ "%": 339,
350
+ "ü": 340,
351
+ "Ἶ": 341,
352
+ "Ὤ": 342,
353
+ "𐆊": 343,
354
+ "ê": 344,
355
+ "à": 345,
356
+ "̄": 346,
357
+ "ç": 347,
358
+ "â": 348,
359
+ "ä": 349,
360
+ "ô": 350,
361
+ "$": 351,
362
+ "𐆆": 352,
363
+ "̓": 353,
364
+ "Ὣ": 354,
365
+ "/": 355,
366
+ "Á": 356,
367
+ "£": 357,
368
+ "Ἧ": 358,
369
+ "Ἓ": 359,
370
+ "ᾂ": 360,
371
+ "Ϡ": 361,
372
+ "^": 362,
373
+ "א": 363,
374
+ "ā": 364,
375
+ "⧙": 365,
376
+ "⧘": 366,
377
+ "̔": 367,
378
+ "Æ": 368,
379
+ "ó": 369,
380
+ "ŭ": 370,
381
+ "É": 371,
382
+ "°": 372,
383
+ "ὒ": 373,
384
+ "̇": 374,
385
+ "⁝": 375,
386
+ "Ἣ": 376,
387
+ "ᾕ": 377,
388
+ "ï": 378,
389
+ "ῗ": 379,
390
+ "ϼ": 380,
391
+ "î": 381,
392
+ "⏒": 382,
393
+ "𐆂": 383,
394
+ "ῼ": 384,
395
+ "í": 385,
396
+ "ᾦ": 386,
397
+ "√": 387,
398
+ "⏔": 388,
399
+ "⸐": 389,
400
+ "ϛ": 390,
401
+ "♃": 391,
402
+ "̆": 392,
403
+ "ἒ": 393,
404
+ "↑": 394,
405
+ "ἇ": 395,
406
+ "ú": 396,
407
+ "û": 397,
408
+ "ē": 398,
409
+ "ᾇ": 399,
410
+ "ῢ": 400,
411
+ "ð": 401,
412
+ "❛": 402,
413
+ "❜": 403,
414
+ "͂": 404,
415
+ "ū": 405,
416
+ "ī": 406,
417
+ "‚": 407,
418
+ "‛": 408,
419
+ "@": 409,
420
+ "⊗": 410,
421
+ "Ϊ": 411,
422
+ "š": 412,
423
+ "ῲ": 413,
424
+ "‧": 414,
425
+ "ś": 415,
426
+ "⏓": 416,
427
+ "⸎": 417,
428
+ "⸓": 418,
429
+ "ṛ": 419,
430
+ "ù": 420,
431
+ "ō": 421,
432
+ "̀": 422,
433
+ "ᾲ": 423,
434
+ "ṇ": 424,
435
+ "ᾡ": 425,
436
+ "※": 426,
437
+ "ͅ": 427,
438
+ "Î": 428,
439
+ "ῧ": 429,
440
+ "ě": 430,
441
+ "ῌ": 431,
442
+ "⁄": 432,
443
+ "ž": 433,
444
+ "̶": 434,
445
+ "Ç": 435,
446
+ "ב": 436,
447
+ "Ὶ": 437,
448
+ "•": 438,
449
+ "\\": 439,
450
+ "י": 440,
451
+ "𐆅": 441,
452
+ "þ": 442,
453
+ "͝": 443,
454
+ "À": 444,
455
+ "ṃ": 445,
456
+ "ו": 446,
457
+ "✕": 447,
458
+ "Ί": 448,
459
+ "Ὗ": 449,
460
+ "å": 450,
461
+ "ר": 451,
462
+ "Ϝ": 452,
463
+ "ì": 453,
464
+ "Œ": 454,
465
+ "⸍": 455,
466
+ "𐅷": 456,
467
+ "Ά": 457,
468
+ "⎫": 458,
469
+ "⎬": 459,
470
+ "⎭": 460,
471
+ "☩": 461,
472
+ "ת": 462,
473
+ "ĝ": 463,
474
+ "☉": 464,
475
+ "È": 465,
476
+ "ש": 466,
477
+ "Έ": 467,
478
+ "ᾼ": 468,
479
+ "↕": 469,
480
+ "מ": 470,
481
+ "נ": 471,
482
+ "ϳ": 472,
483
+ "♎": 473,
484
+ "♏": 474,
485
+ "♑": 475,
486
+ "ò": 476,
487
+ "ה": 477,
488
+ "ן": 478,
489
+ "ק": 479,
490
+ "ė": 480,
491
+ "♈": 481,
492
+ "♉": 482,
493
+ "♋": 483,
494
+ "♌": 484,
495
+ "̳": 485,
496
+ "⳨": 486,
497
+ "♐": 487,
498
+ "♒": 488,
499
+ "♓": 489,
500
+ "͡": 490,
501
+ "Ñ": 491,
502
+ "ã": 492,
503
+ "ס": 493,
504
+ "̂": 494,
505
+ "♊": 495,
506
+ "♍": 496,
507
+ "⁚": 497,
508
+ "ᾁ": 498,
509
+ "⏕": 499,
510
+ "∶": 500,
511
+ "ל": 501,
512
+ "פ": 502,
513
+ "ą": 503,
514
+ "ĩ": 504,
515
+ "⎪": 505,
516
+ "ח": 506,
517
+ "∧": 507,
518
+ "צ": 508,
519
+ "Ἇ": 509,
520
+ "Ͻ": 510,
521
+ "҅": 511
522
+ }
523
+ }
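Because the vocabulary is purely character-level, id lookup amounts to NFKC-normalizing a word (as `tokenize` does above) and mapping each character, with `[UNK]` as the fallback. The snippet below is illustrative only; the local `vocab.json` path is an assumption.

```python
# Illustrative character-id lookup against the vocabulary above; the local
# file path is an assumption, not part of the commit.
import json
import unicodedata

with open("vocab.json") as f:
    char_to_id = json.load(f)["vocab"]

for word in ["sentence", "λόγος"]:
    word = unicodedata.normalize("NFKC", word)  # mirrors HLMTokenizer.tokenize
    print(word, [char_to_id.get(ch, char_to_id["[UNK]"]) for ch in word])
```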