kevinkrahn committed on
Commit 891db93
1 Parent(s): 89a4cd4

Add new SentenceTransformer model.
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 768,
+   "pooling_mode_cls_token": true,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
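This pooling configuration enables only `pooling_mode_cls_token`, so the sentence embedding is simply the first ([CLS]) token of the transformer output. A minimal sketch of that operation (the function name and toy shapes are illustrative, not the library's internals):

```python
import torch

def cls_pooling(token_embeddings: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, word_embedding_dimension) from the transformer module.
    # With pooling_mode_cls_token=true, the sentence embedding is the hidden state
    # of the first token of each sequence.
    return token_embeddings[:, 0]

# Toy check: batch of 2 sequences, 5 tokens, dimension 768.
print(cls_pooling(torch.randn(2, 5, 768)).shape)  # torch.Size([2, 768])
```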
README.md ADDED
@@ -0,0 +1,125 @@
+ ---
+ library_name: sentence-transformers
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+ - semantic-search
+
+ ---
+
+ # sge-hlm
+
+ ## Sentence embeddings for English and Ancient Greek
+
+ The HLM model architecture is based on [Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers](https://aclanthology.org/2024.sigtyp-1.16/), but uses a simpler design with rotary embeddings (see the implementation in the `HLM` folder) instead of building on DeBERTa. This architecture produces superior results compared to vanilla BERT for low-resource languages like Ancient Greek. It is trained to produce sentence embeddings using the method described in [Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge Distillation](https://aclanthology.org/2023.alp-1.2/).
+
+ This model was distilled from `BAAI/bge-base-en-v1.5` for embedding English and Ancient Greek text.
+
+ ## Usage (Sentence-Transformers)
+
+ First install [sentence-transformers](https://www.SBERT.net):
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+
+
+ ## Usage (HuggingFace Transformers)
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+
+ def cls_pooling(model_output, attention_mask):
+     return model_output[0][:,0]
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+ model = AutoModel.from_pretrained('kevinkrahn/shlm-grc-en', trust_remote_code=True)
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling. In this case, CLS pooling.
+ sentence_embeddings = cls_pooling(model_output, encoded_input['attention_mask'])
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+ ## Citing & Authors
+
+ ```
+ @inproceedings{riemenschneider-krahn-2024-heidelberg,
+     title = "Heidelberg-Boston @ {SIGTYP} 2024 Shared Task: Enhancing Low-Resource Language Analysis With Character-Aware Hierarchical Transformers",
+     author = "Riemenschneider, Frederick and
+       Krahn, Kevin",
+     editor = "Hahn, Michael and
+       Sorokin, Alexey and
+       Kumar, Ritesh and
+       Shcherbakov, Andreas and
+       Otmakhova, Yulia and
+       Yang, Jinrui and
+       Serikov, Oleg and
+       Rani, Priya and
+       Ponti, Edoardo M. and
+       Murado{\u{g}}lu, Saliha and
+       Gao, Rena and
+       Cotterell, Ryan and
+       Vylomova, Ekaterina",
+     booktitle = "Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
+     month = mar,
+     year = "2024",
+     address = "St. Julian's, Malta",
+     publisher = "Association for Computational Linguistics",
+     url = "https://aclanthology.org/2024.sigtyp-1.16",
+     pages = "131--141",
+ }
+ ```
+
+ ```
+ @inproceedings{krahn-etal-2023-sentence,
+     title = "Sentence Embedding Models for {A}ncient {G}reek Using Multilingual Knowledge Distillation",
+     author = "Krahn, Kevin and
+       Tate, Derrick and
+       Lamicela, Andrew C.",
+     editor = "Anderson, Adam and
+       Gordin, Shai and
+       Li, Bin and
+       Liu, Yudong and
+       Passarotti, Marco C.",
+     booktitle = "Proceedings of the Ancient Language Processing Workshop",
+     month = sep,
+     year = "2023",
+     address = "Varna, Bulgaria",
+     publisher = "INCOMA Ltd., Shoumen, Bulgaria",
+     url = "https://aclanthology.org/2023.alp-1.2",
+     pages = "13--22",
+ }
+ ```
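The README stops at printing raw embeddings. For the semantic-search use case listed in the tags, a short follow-up sketch using standard sentence-transformers utilities (the example sentences are illustrative, and `trust_remote_code=True` is assumed to be required for the custom architecture):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('kevinkrahn/shlm-grc-en', trust_remote_code=True)

query = "In the beginning was the Word"
corpus = [
    "Ἐν ἀρχῇ ἦν ὁ λόγος",            # Ancient Greek
    "This is an unrelated sentence",
]

query_emb = model.encode(query, convert_to_tensor=True)
corpus_emb = model.encode(corpus, convert_to_tensor=True)

# Cosine similarity between the query and each corpus sentence.
print(util.cos_sim(query_emb, corpus_emb))
```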
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "models/output/shlm-grc-en",
+   "architectures": [
+     "HLMModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_hlm.HLMConfig",
+     "AutoModel": "modeling_hlm.HLMModel"
+   },
+   "embedding_size": -1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "inter_word_encoder": {
+     "intermediate_size": 2048,
+     "model_type": "",
+     "sandwich_size": 2
+   },
+   "intra_word_encoder": {
+     "intermediate_size": 1536,
+     "model_type": "",
+     "num_hidden_layers": 4
+   },
+   "max_seq_length": 256,
+   "max_word_length": 16,
+   "model_type": "hlm",
+   "pad_token_id": 0,
+   "residual_word_embedding": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.38.2",
+   "type_vocab_size": 2,
+   "vocab_size": 512
+ }
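The `auto_map` block above is what lets the `Auto*` classes resolve the custom `HLMConfig` and `HLMModel` defined in this repository. A hedged sketch of loading through that mechanism (the values in the comments are taken from the config above):

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code=True opts in to running configuration_hlm.py / modeling_hlm.py
# from this repository, as pointed to by "auto_map".
config = AutoConfig.from_pretrained("kevinkrahn/shlm-grc-en", trust_remote_code=True)
print(config.model_type)                            # "hlm"
print(config.intra_word_encoder.num_hidden_layers)  # 4 intra-word layers
print(config.inter_word_encoder.sandwich_size)      # 2 weight-shared "sandwich" layers

model = AutoModel.from_pretrained("kevinkrahn/shlm-grc-en", trust_remote_code=True)
```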
config_sentence_transformers.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.4.0.dev0",
+     "transformers": "4.39.3",
+     "pytorch": "2.3.0+cu121"
+   },
+   "prompts": {},
+   "default_prompt_name": null
+ }
configuration_hlm.py ADDED
@@ -0,0 +1,59 @@
+ from transformers import PretrainedConfig
+
+ class HLMEncoderConfig(PretrainedConfig):
+     def __init__(
+         self,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         hidden_dropout_prob=0.1,
+         layer_norm_eps=1e-7,
+         sandwich=False,
+         sandwich_size=0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.dropout_prob = hidden_dropout_prob
+         self.layer_norm_eps = layer_norm_eps
+         if sandwich:
+             self.sandwich_size = num_hidden_layers // 6
+         else:
+             self.sandwich_size = sandwich_size
+
+
+ class HLMConfig(PretrainedConfig):
+     model_type = "hlm"
+
+     def __init__(
+         self,
+         vocab_size=512,
+         type_vocab_size=2,
+         embedding_size=-1,
+         max_seq_length=256,
+         max_word_length=16,
+         initializer_range=0.02,
+         pad_token_id=0,
+         intra_word_encoder={},
+         inter_word_encoder={},
+         residual_word_embedding=False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.vocab_size = vocab_size
+         self.type_vocab_size = type_vocab_size
+         self.embedding_size = embedding_size
+         self.initializer_range = initializer_range
+         self.max_seq_length = max_seq_length
+         self.max_word_length = max_word_length
+         self.pad_token_id = pad_token_id
+         self.intra_word_encoder = HLMEncoderConfig(**intra_word_encoder)
+         self.inter_word_encoder = HLMEncoderConfig(**inter_word_encoder)
+         self.hidden_size = self.inter_word_encoder.hidden_size
+         self.residual_word_embedding = residual_word_embedding
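To show how the nested encoder configs compose, a small sketch constructing `HLMConfig` directly with shapes matching config.json (assumes `configuration_hlm.py` is importable from the working directory):

```python
from configuration_hlm import HLMConfig

config = HLMConfig(
    vocab_size=512,       # character-level vocabulary
    max_word_length=16,   # characters per word (padded to this length)
    max_seq_length=256,   # words per sequence
    intra_word_encoder={"intermediate_size": 1536, "num_hidden_layers": 4},
    inter_word_encoder={"intermediate_size": 2048, "sandwich_size": 2},
)

# The dicts are wrapped in HLMEncoderConfig, and the top-level hidden size
# follows the inter-word encoder (768 by default).
print(config.intra_word_encoder.hidden_size, config.hidden_size)  # 768 768
```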
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0a6e4c5f4eb9a71f57b56dac6a207932e0def2a9fb3c9956ae28482b39cfe6f
+ size 379310632
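The file above is a Git LFS pointer, not the weights themselves; the roughly 379 MB safetensors file is fetched separately. A small sketch for verifying a downloaded copy against the pointer's SHA-256 (the local path is an assumption):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading ~379 MB into memory at once.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Path assumes the repository was cloned with git-lfs so the real weights are present.
print(sha256_of("model.safetensors") ==
      "d0a6e4c5f4eb9a71f57b56dac6a207932e0def2a9fb3c9956ae28482b39cfe6f")
```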
modeling_hlm.py ADDED
@@ -0,0 +1,614 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from dataclasses import dataclass
6
+ import copy
7
+
8
+ from transformers.modeling_outputs import BaseModelOutput, ModelOutput, MaskedLMOutput, TokenClassifierOutput, SequenceClassifierOutput
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, AutoModelForMaskedLM, AutoTokenizer, AutoModelForSequenceClassification
11
+ from .configuration_hlm import HLMConfig, HLMEncoderConfig
12
+ from .tokenization_hlm import HLMTokenizer
13
+
14
+ from typing import Tuple, Optional, Union
15
+
16
+ @dataclass
17
+ class HLMBaseModelOutput(ModelOutput):
18
+ last_hidden_state: torch.FloatTensor = None
19
+ hidden_states: Tuple[torch.FloatTensor] = None
20
+ attentions: Tuple[torch.FloatTensor] = None # Not currently supported
21
+
22
+ initial_embeds: torch.FloatTensor = None
23
+ initial_word_embeds: torch.FloatTensor = None
24
+ intra_word_mask: torch.LongTensor = None
25
+ char_embeds: torch.LongTensor = None
26
+ input_shape: Tuple[int, int, int, int] = None
27
+
28
+
29
+ class HLMEncoder(nn.Module):
30
+ def __init__(self, config) -> None:
31
+ super().__init__()
32
+
33
+ if config.sandwich_size > 0:
34
+ sandwich_start_index = config.num_hidden_layers // 2 - config.sandwich_size
35
+ sandwich_indices = [sandwich_start_index + i*2 + 1 for i in range(config.sandwich_size)]
36
+ #print('Sandwich indices:', sandwich_indices)
37
+ self.layers = nn.ModuleList([
38
+ TransformerBlock(config, bias=i in sandwich_indices) for i in range(config.num_hidden_layers)])
39
+ for i in range(config.sandwich_size):
40
+ self.layers[sandwich_start_index + i*2+1].make_sandwich(self.layers[sandwich_start_index + i*2])
41
+ else:
42
+ self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_hidden_layers)])
43
+
44
+ def _get_attention_mask(self, attn_mask, dtype):
45
+ if attn_mask.dim() <= 2:
46
+ extended_mask = attn_mask.unsqueeze(1).unsqueeze(2)
47
+ extended_mask = extended_mask*extended_mask.squeeze(-2).unsqueeze(-1)
48
+ elif attn_mask.dim() == 3:
49
+ extended_mask = attn_mask.unsqueeze(1)
50
+ else:
51
+ extended_mask = attn_mask
52
+
53
+ # Convert to float to avoid zero in denominator of softmax in SDPA, resulting in NaNs
54
+ min_dtype = torch.finfo(dtype).min
55
+ extended_mask = ((1.0 - extended_mask.float()) * min_dtype)
56
+
57
+ # SDPA returns NaNs for fully masked rows, so attend to all tokens instead
58
+ extended_mask = extended_mask.mul(~torch.all(extended_mask==min_dtype, dim=-1, keepdim=True))
59
+
60
+ return extended_mask
61
+
62
+ def forward(self, hidden_states, attention_mask, freqs_cos, freqs_sin, return_dict=True, output_hidden_states=False):
63
+ all_hidden_states = []
64
+ attn_mask = self._get_attention_mask(attention_mask, hidden_states.dtype)
65
+ for layer in self.layers:
66
+ hidden_states = layer(hidden_states, attn_mask, freqs_cos, freqs_sin)
67
+ all_hidden_states.append(hidden_states)
68
+
69
+ if return_dict:
70
+ return BaseModelOutput(
71
+ last_hidden_state=all_hidden_states[-1],
72
+ hidden_states=all_hidden_states if output_hidden_states else None,
73
+ attentions=None,
74
+ )
75
+ else:
76
+ return (all_hidden_states[-1], all_hidden_states) if output_hidden_states else all_hidden_states
77
+
78
+
79
+ class HLMPreTrainedModel(PreTrainedModel):
80
+ """
81
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
82
+ models.
83
+ """
84
+
85
+ config_class = HLMConfig
86
+ base_model_prefix = "hlm"
87
+ _keys_to_ignore_on_load_unexpected = []
88
+ supports_gradient_checkpointing = True
89
+
90
+ def _init_weights(self, module):
91
+ """Initialize the weights."""
92
+ if isinstance(module, nn.Linear):
93
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
94
+ if module.bias is not None:
95
+ module.bias.data.zero_()
96
+ elif isinstance(module, nn.Embedding):
97
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
98
+ if module.padding_idx is not None:
99
+ module.weight.data[module.padding_idx].zero_()
100
+
101
+
102
+ class HLMModel(HLMPreTrainedModel):
103
+ def __init__(self, config):
104
+ super().__init__(config)
105
+
106
+ self.config = config
107
+
108
+ self.char_embeddings = nn.Embedding(config.vocab_size, config.intra_word_encoder.hidden_size, padding_idx=0)
109
+ self.char_embedding_dropout = nn.Dropout(config.intra_word_encoder.dropout_prob)
110
+
111
+ if self.config.embedding_size != -1 and self.config.embedding_size != self.config.intra_word_encoder.hidden_size:
112
+ self.char_embedding_project = nn.Linear(self.config.embedding_size, self.config.intra_word_encoder.hidden_size, bias=False)
113
+
114
+ freqs_cos, freqs_sin = precompute_freqs_cis(config.intra_word_encoder.hidden_size // config.intra_word_encoder.num_attention_heads, config.max_seq_length)
115
+ self.register_buffer("freqs_cos", freqs_cos)
116
+ self.register_buffer("freqs_sin", freqs_sin)
117
+
118
+ self.word_type_embeddings = nn.Embedding(config.type_vocab_size, config.intra_word_encoder.hidden_size)
119
+
120
+ self.intra_word_encoder = HLMEncoder(config.intra_word_encoder)
121
+ if self.config.intra_word_encoder.hidden_size != self.config.inter_word_encoder.hidden_size:
122
+ self.intra_word_project = nn.Linear(self.config.intra_word_encoder.hidden_size, self.config.inter_word_encoder.hidden_size, bias=False)
123
+
124
+ self.inter_word_encoder = HLMEncoder(config.inter_word_encoder)
125
+
126
+ # Initialize weights and apply final processing
127
+ self.post_init()
128
+
129
+ def get_input_embeddings(self):
130
+ return self.char_embeddings
131
+
132
+ def set_input_embeddings(self, new_embeddings):
133
+ self.char_embeddings = new_embeddings
134
+
135
+ def forward(self, input_ids, char_input_mask, word_input_mask, word_type_ids=None, combined_word_embeddings: Optional[bool]=False, output_hidden_states: Optional[bool]=False, return_dict: Optional[bool]=True):
136
+ input_embeds = self.char_embeddings(input_ids)
137
+ input_embeds = self.char_embedding_dropout(input_embeds)
138
+
139
+ if hasattr(self, "char_embedding_project"):
140
+ input_embeds = self.char_embedding_project(input_embeds)
141
+
142
+ batch_size, num_word, _, _ = input_embeds.shape
143
+ num_char = self.config.max_word_length
144
+
145
+ # reshape to attend to intra-word tokens rather than full sequence
146
+ input_embeds = input_embeds.view(batch_size * num_word, num_char, self.config.intra_word_encoder.hidden_size)
147
+ intra_word_mask = char_input_mask.view(batch_size * num_word, num_char)
148
+ intra_word_output = self.intra_word_encoder(
149
+ input_embeds,
150
+ intra_word_mask,
151
+ self.freqs_cos[:num_char],
152
+ self.freqs_sin[:num_char],
153
+ output_hidden_states=False,
154
+ return_dict=True,
155
+ )
156
+ initial_embeds = intra_word_output.last_hidden_state
157
+
158
+ # extract [WORD_CLS] embeddings, which are always at the beginning of each word
159
+ initial_word_embeds = initial_embeds[:,0,:]
160
+
161
+ if word_type_ids is not None:
162
+ word_type_embeds = self.word_type_embeddings(word_type_ids)
163
+ word_type_embeds = word_type_embeds.view(batch_size * num_word, self.config.intra_word_encoder.hidden_size)
164
+ initial_word_embeds = initial_word_embeds + word_type_embeds
165
+
166
+ if hasattr(self, "intra_word_project"):
167
+ initial_embeds = self.intra_word_project(initial_embeds)
168
+
169
+ # reshape and extract contextualized inter-word representation
170
+ word_embeds = initial_word_embeds.view(batch_size, num_word, self.config.inter_word_encoder.hidden_size)
171
+ inter_word_output = self.inter_word_encoder(
172
+ word_embeds,
173
+ word_input_mask,
174
+ self.freqs_cos[:num_word],
175
+ self.freqs_sin[:num_word],
176
+ output_hidden_states=output_hidden_states,
177
+ return_dict=True,
178
+ )
179
+
180
+ if combined_word_embeddings:
181
+ initial_word_embeds = initial_word_embeds.view(batch_size, num_word, self.config.inter_word_encoder.hidden_size)
182
+ contextual_word_embeds = inter_word_output.last_hidden_state
183
+ combined_word_embeds = torch.cat([initial_word_embeds, contextual_word_embeds], dim=2)
184
+ last_hidden_state = combined_word_embeds
185
+ else:
186
+ last_hidden_state = inter_word_output.last_hidden_state
187
+
188
+ if return_dict:
189
+ return HLMBaseModelOutput(
190
+ last_hidden_state=last_hidden_state,
191
+ hidden_states=inter_word_output.hidden_states if output_hidden_states else None,
192
+ initial_embeds=initial_embeds,
193
+ initial_word_embeds=initial_word_embeds,
194
+ intra_word_mask=intra_word_mask,
195
+ char_embeds=input_embeds,
196
+ input_shape=(batch_size, num_word, num_char, self.config.inter_word_encoder.hidden_size),
197
+ )
198
+ else:
199
+ return (
200
+ last_hidden_state,
201
+ inter_word_output.hidden_states if output_hidden_states else None,
202
+ initial_embeds,
203
+ initial_word_embeds,
204
+ intra_word_mask,
205
+ input_embeds,
206
+ (batch_size, num_word, num_char, self.config.inter_word_encoder.hidden_size),
207
+ )
208
+
209
+
210
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
211
+ ndim = x.ndim
212
+ assert 0 <= 1 < ndim
213
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
214
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
215
+ return freqs_cis.view(*shape)
216
+
217
+
218
+ def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
219
+ # reshape xq and xk to match the complex representation
220
+ xq_r, xq_i = xq.float().reshape(*xq.shape[:-1], -1, 2).unbind(-1)
221
+ xk_r, xk_i = xk.float().reshape(*xk.shape[:-1], -1, 2).unbind(-1)
222
+
223
+ # reshape freqs_cos and freqs_sin for broadcasting
224
+ freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
225
+ freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
226
+
227
+ # apply rotation using real numbers
228
+ xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
229
+ xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
230
+ xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
231
+ xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos
232
+
233
+ # flatten last two dimensions
234
+ xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
235
+ xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
236
+
237
+ return xq_out.type_as(xq), xk_out.type_as(xk)
238
+
239
+
240
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
241
+ freqs = 1.0 / (
242
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
243
+ )
244
+ t = torch.arange(end, device=freqs.device) # type: ignore
245
+ freqs = torch.outer(t, freqs).float() # type: ignore
246
+ freqs_cos = torch.cos(freqs) # real part
247
+ freqs_sin = torch.sin(freqs) # imaginary part
248
+ return freqs_cos, freqs_sin
249
+
250
+
251
+ class RMSNorm(torch.nn.Module):
252
+ def __init__(self, dim: int, eps: float = 1e-6):
253
+ super().__init__()
254
+ self.eps = eps
255
+ self.weight = nn.Parameter(torch.ones(dim))
256
+
257
+ def _norm(self, x):
258
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
259
+
260
+ def forward(self, x):
261
+ output = self._norm(x.float()).type_as(x)
262
+ return output * self.weight
263
+
264
+
265
+ class TransformerBlock(nn.Module):
266
+ def __init__(self, config: HLMEncoderConfig, bias: bool = False):
267
+ super().__init__()
268
+
269
+ self.pad_id = config.pad_token_id
270
+ self.drop_p = config.dropout_prob
271
+ self.n_heads = config.num_attention_heads
272
+ self.d_head = config.hidden_size // config.num_attention_heads
273
+ self.has_bias = bias
274
+ dim = config.hidden_size
275
+
276
+ # Attention
277
+ self.q = nn.Linear(in_features=dim, out_features=dim, bias=bias)
278
+ self.k = nn.Linear(in_features=dim, out_features=dim, bias=bias)
279
+ self.v = nn.Linear(in_features=dim, out_features=dim, bias=bias)
280
+ self.att_proj_linear = nn.Linear(in_features=dim, out_features=dim, bias=bias)
281
+ self.resid_dropout = nn.Dropout(self.drop_p)
282
+
283
+ # Feedforward layer
284
+ self.ff_dropout = nn.Dropout(self.drop_p)
285
+ self.ff_linear_1 = nn.Linear(in_features=dim, out_features=config.intermediate_size, bias=bias)
286
+ self.ff_linear_2 = nn.Linear(in_features=config.intermediate_size, out_features=dim, bias=bias)
287
+ self.ff_linear_3 = nn.Linear(in_features=dim, out_features=config.intermediate_size, bias=bias)
288
+
289
+ # Pre-layer norms
290
+ self.attn_norm = RMSNorm(dim, eps=config.layer_norm_eps)
291
+ self.ff_norm = RMSNorm(dim, eps=config.layer_norm_eps)
292
+
293
+ def make_sandwich(self, other):
294
+ assert self.has_bias
295
+ assert not other.has_bias
296
+ self.q.weight = other.q.weight
297
+ self.k.weight = other.k.weight
298
+ self.v.weight = other.v.weight
299
+ self.att_proj_linear.weight = other.att_proj_linear.weight
300
+ self.ff_linear_1.weight = other.ff_linear_1.weight
301
+ self.ff_linear_2.weight = other.ff_linear_2.weight
302
+ self.ff_linear_3.weight = other.ff_linear_3.weight
303
+
304
+ def forward(self, x: torch.Tensor, pad_mask: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
305
+ x = x + self._attention_block(self.attn_norm(x), pad_mask, freqs_cos, freqs_sin)
306
+ x = x + self._feedforward_block(self.ff_norm(x))
307
+ return x
308
+
309
+ def _attention_block(self, x: torch.Tensor, attn_mask: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
310
+ batch_size, seq_len, _ = x.shape
311
+ xq, xk, xv = self.q(x), self.k(x), self.v(x)
312
+
313
+ # Reshape for rotary embeddings
314
+ xq = xq.view(batch_size, seq_len, self.n_heads, self.d_head)
315
+ xk = xk.view(batch_size, seq_len, self.n_heads, self.d_head)
316
+ xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
317
+ xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
318
+
319
+ # Reshape for attention calculation: (b_sz, n_head, s_len, d_head)
320
+ xq = xq.transpose(1, 2)
321
+ xk = xk.transpose(1, 2)
322
+ xv = xv.transpose(1, 2)
323
+
324
+ att = F.scaled_dot_product_attention(
325
+ query=xq, key=xk, value=xv,
326
+ attn_mask=attn_mask,
327
+ dropout_p=self.drop_p if self.training else 0.0,
328
+ is_causal=False,
329
+ )
330
+
331
+ # Shape (b_sz, s_len, n_head, d_head)
332
+ out = att.transpose(1, 2).contiguous()
333
+ out = out.view(batch_size, seq_len, self.n_heads * self.d_head)
334
+
335
+ return self.resid_dropout(self.att_proj_linear(out))
336
+
337
+ def _feedforward_block(self, x: torch.Tensor):
338
+ # SWiGLU activation
339
+ x = self.ff_linear_2(F.silu(self.ff_linear_1(x)) * self.ff_linear_3(x))
340
+ x = self.ff_dropout(x)
341
+ return x
342
+
343
+
344
+ class HLMForMaskedLM(HLMPreTrainedModel):
345
+ _tied_weights_keys = ["cls.decoder.weight", "cls.decoder.bias"]
346
+
347
+ def __init__(self, config):
348
+ super().__init__(config)
349
+
350
+ # NOTE: This property name must match "base_model_prefix" in the base class
351
+ self.hlm = HLMModel(config)
352
+ self.cls = HLMLMPredictionHead(config)
353
+
354
+ # Initialize weights and apply final processing
355
+ self.post_init()
356
+
357
+ def get_output_embeddings(self):
358
+ return self.cls.decoder
359
+
360
+ def set_output_embeddings(self, new_embeddings):
361
+ self.cls.decoder = new_embeddings
362
+
363
+ def forward(
364
+ self,
365
+ input_ids: Optional[torch.Tensor] = None,
366
+ labels: Optional[torch.Tensor] = None,
367
+ char_input_mask: Optional[torch.Tensor] = None,
368
+ word_input_mask: Optional[torch.Tensor] = None,
369
+ word_type_ids: Optional[torch.Tensor] = None,
370
+ output_hidden_states: Optional[bool] = None,
371
+ return_dict: Optional[bool] = True,
372
+ ) -> Union[Tuple, MaskedLMOutput]:
373
+ r"""
374
+ labels (`torch.LongTensor` of shape `(batch_size, num_words, max_chars_per_word)`, *optional*):
375
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
376
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
377
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
378
+ """
379
+
380
+ outputs = self.hlm(
381
+ input_ids,
382
+ char_input_mask=char_input_mask,
383
+ word_input_mask=word_input_mask,
384
+ word_type_ids=word_type_ids,
385
+ output_hidden_states=output_hidden_states,
386
+ return_dict=return_dict,
387
+ combined_word_embeddings=False,
388
+ )
389
+
390
+ prediction_scores = self.cls(outputs,
391
+ freqs_cos=self.hlm.freqs_cos[:self.config.max_word_length],
392
+ freqs_sin=self.hlm.freqs_sin[:self.config.max_word_length])
393
+
394
+ masked_lm_loss = None
395
+ if labels is not None:
396
+ loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
397
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
398
+
399
+ if not return_dict:
400
+ output = (prediction_scores,) + outputs[1:]
401
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
402
+ else:
403
+ return MaskedLMOutput(
404
+ loss=masked_lm_loss,
405
+ logits=prediction_scores,
406
+ hidden_states=outputs.hidden_states,
407
+ )
408
+
409
+
410
+ class HLMLMPredictionHead(nn.Module):
411
+ def __init__(self, config):
412
+ super().__init__()
413
+
414
+ intra_word_encoder_config = copy.copy(config.intra_word_encoder)
415
+ intra_word_encoder_config.num_hidden_layers = 1
416
+ intra_word_encoder_config.sandwich_size = 0
417
+ self.intra_word_encoder = HLMEncoder(intra_word_encoder_config)
418
+ self.residual_word_embedding = getattr(config, 'residual_word_embedding', False)
419
+ self.config = config
420
+
421
+ if self.config.intra_word_encoder.hidden_size != self.config.inter_word_encoder.hidden_size:
422
+ self.inter_word_project = nn.Linear(config.inter_word_encoder.hidden_size, self.config.intra_word_encoder.hidden_size, bias=False)
423
+
424
+ if getattr(config, "tie_word_embeddings", True):
425
+ # The output weights are the same as the input embeddings, but there is
426
+ # an output-only bias for each token.
427
+ self.decoder = nn.Linear(config.intra_word_encoder.hidden_size, config.vocab_size, bias=False)
428
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
429
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
430
+ self.decoder.bias = self.bias
431
+ else:
432
+ self.decoder = nn.Linear(config.intra_word_encoder.hidden_size, config.vocab_size)
433
+
434
+ def forward(self, base_model_output: HLMBaseModelOutput, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor):
435
+ batch_size, num_word, _, _ = base_model_output.input_shape
436
+
437
+ word_embeds = base_model_output.last_hidden_state.reshape(batch_size * num_word, 1, self.config.inter_word_encoder.hidden_size)
438
+
439
+ if self.residual_word_embedding:
440
+ # residual connection between initial word embeddings and contextual word embeddings as mentioned in the paper (section A.3)
441
+ word_embeds += base_model_output.initial_word_embeds.unsqueeze(1)
442
+
443
+ if hasattr(self, "inter_word_project"):
444
+ word_embeds = self.inter_word_project(word_embeds)
445
+
446
+ # concatenate to restore the character-level token sequence
447
+ char_embeds = torch.cat([word_embeds, base_model_output.initial_embeds[:,1:,:]], dim=1)
448
+
449
+ intra_word_output = self.intra_word_encoder(
450
+ char_embeds,
451
+ base_model_output.intra_word_mask,
452
+ freqs_cos, freqs_sin,
453
+ output_hidden_states=False,
454
+ return_dict=True,
455
+ )
456
+
457
+ char_logits = self.decoder(intra_word_output.last_hidden_state)
458
+ batch_size, num_word, num_char, _ = base_model_output.input_shape
459
+ char_logits = char_logits.reshape(batch_size, num_word * num_char, -1)
460
+ return char_logits
461
+
462
+
463
+ class HLMForTokenClassification(HLMPreTrainedModel):
464
+ def __init__(self, config):
465
+ super().__init__(config)
466
+ self.num_labels = config.num_labels
467
+
468
+ self.hlm = HLMModel(config)
469
+ self.cls = nn.Linear(config.inter_word_encoder.hidden_size*2, config.num_labels)
470
+
471
+ # Initialize weights and apply final processing
472
+ self.post_init()
473
+
474
+ def forward(
475
+ self,
476
+ input_ids: Optional[torch.Tensor] = None,
477
+ char_input_mask: Optional[torch.Tensor] = None,
478
+ word_input_mask: Optional[torch.Tensor] = None,
479
+ labels: Optional[torch.Tensor] = None,
480
+ output_hidden_states: Optional[bool] = None,
481
+ return_dict: Optional[bool] = None,
482
+ ) -> Union[Tuple, TokenClassifierOutput]:
483
+ r"""
484
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
485
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
486
+ """
487
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
488
+
489
+ outputs = self.hlm(
490
+ input_ids,
491
+ char_input_mask=char_input_mask,
492
+ word_input_mask=word_input_mask,
493
+ output_hidden_states=output_hidden_states,
494
+ combined_word_embeddings=True,
495
+ )
496
+
497
+ logits = self.cls(outputs.last_hidden_state)
498
+
499
+ loss = None
500
+ if labels is not None:
501
+ loss_fct = nn.CrossEntropyLoss()
502
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
503
+
504
+ if not return_dict:
505
+ output = (logits,) + outputs[1:]
506
+ return ((loss,) + output) if loss is not None else output
507
+
508
+ return TokenClassifierOutput(
509
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
510
+ )
511
+
512
+
513
+ class HLMForSequenceClassification(HLMPreTrainedModel):
514
+ def __init__(self, config):
515
+ super().__init__(config)
516
+
517
+ self.config = config
518
+ self.num_labels = getattr(config, 'num_labels', 2)
519
+ self.hlm = HLMModel(config)
520
+
521
+ self.dense = nn.Linear(config.inter_word_encoder.hidden_size, config.inter_word_encoder.hidden_size)
522
+ self.dropout = nn.Dropout(0.1)
523
+ self.classifier = nn.Linear(config.inter_word_encoder.hidden_size, config.num_labels)
524
+ #self.activation = SwiGLU()
525
+ self.activation = nn.GELU()
526
+
527
+ # Initialize weights and apply final processing
528
+ self.post_init()
529
+
530
+ def forward(
531
+ self,
532
+ input_ids: Optional[torch.Tensor] = None,
533
+ char_input_mask: Optional[torch.Tensor] = None,
534
+ word_input_mask: Optional[torch.Tensor] = None,
535
+ word_type_ids: Optional[torch.Tensor] = None,
536
+ labels: Optional[torch.Tensor] = None,
537
+ output_hidden_states: Optional[bool] = None,
538
+ return_dict: Optional[bool] = None,
539
+ ) -> Union[Tuple, SequenceClassifierOutput]:
540
+ r"""
541
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
542
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
543
+ """
544
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
545
+
546
+ outputs = self.hlm(
547
+ input_ids,
548
+ char_input_mask=char_input_mask,
549
+ word_input_mask=word_input_mask,
550
+ word_type_ids=word_type_ids,
551
+ output_hidden_states=output_hidden_states,
552
+ combined_word_embeddings=False,
553
+ )
554
+
555
+ emb = outputs.last_hidden_state[:, 0]
556
+ emb = self.dense(emb)
557
+ emb = self.activation(emb)
558
+ emb = self.dropout(emb)
559
+ logits = self.classifier(emb)
560
+
561
+ loss = None
562
+ if labels is not None:
563
+ if self.config.problem_type is None:
564
+ if self.num_labels == 1:
565
+ # regression task
566
+ loss_fn = nn.MSELoss()
567
+ logits = logits.view(-1).to(labels.dtype)
568
+ loss = loss_fn(logits, labels.view(-1))
569
+ elif labels.dim() == 1 or labels.size(-1) == 1:
570
+ label_index = (labels >= 0).nonzero()
571
+ labels = labels.long()
572
+ if label_index.size(0) > 0:
573
+ labeled_logits = torch.gather(
574
+ logits, 0, label_index.expand(label_index.size(0), logits.size(1))
575
+ )
576
+ labels = torch.gather(labels, 0, label_index.view(-1))
577
+ loss_fct = nn.CrossEntropyLoss()
578
+ loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
579
+ else:
580
+ loss = torch.tensor(0).to(logits)
581
+ else:
582
+ log_softmax = nn.LogSoftmax(-1)
583
+ loss = -((log_softmax(logits) * labels).sum(-1)).mean()
584
+ elif self.config.problem_type == "regression":
585
+ loss_fct = nn.MSELoss()
586
+ if self.num_labels == 1:
587
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
588
+ else:
589
+ loss = loss_fct(logits, labels)
590
+ elif self.config.problem_type == "single_label_classification":
591
+ loss_fct = nn.CrossEntropyLoss()
592
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
593
+ elif self.config.problem_type == "multi_label_classification":
594
+ loss_fct = nn.BCEWithLogitsLoss()
595
+ loss = loss_fct(logits, labels)
596
+ if not return_dict:
597
+ output = (logits,) + outputs[1:]
598
+ return ((loss,) + output) if loss is not None else output
599
+
600
+ return SequenceClassifierOutput(
601
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states)
602
+
603
+
604
+ AutoConfig.register("hlm", HLMConfig)
605
+ AutoModel.register(HLMConfig, HLMModel)
606
+ AutoModelForTokenClassification.register(HLMConfig, HLMForTokenClassification)
607
+ AutoModelForSequenceClassification.register(HLMConfig, HLMForSequenceClassification)
608
+ AutoModelForMaskedLM.register(HLMConfig, HLMForMaskedLM)
609
+ AutoTokenizer.register(HLMConfig, HLMTokenizer)
610
+ HLMConfig.register_for_auto_class()
611
+ HLMModel.register_for_auto_class("AutoModel")
612
+ HLMForMaskedLM.register_for_auto_class("AutoModelForMaskedLM")
613
+ HLMForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")
614
+ HLMForTokenClassification.register_for_auto_class("AutoModelForTokenClassification")
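modeling_hlm.py replaces learned absolute positions with rotary position embeddings applied to queries and keys (`precompute_freqs_cis` / `apply_rotary_emb`). A self-contained sketch of the same real-valued rotation, checking two properties RoPE relies on: it preserves vector norms, and the query-key dot product depends only on the relative offset between positions (toy dimensions and helper names, not the model's own code):

```python
import torch

def precompute_freqs(dim: int, end: int, theta: float = 10000.0):
    # One frequency per pair of channels, as in the model's precompute_freqs_cis.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    angles = torch.outer(torch.arange(end).float(), freqs)  # (end, dim/2)
    return torch.cos(angles), torch.sin(angles)

def rotate(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (seq, dim); rotate each (even, odd) channel pair by the position's angle.
    x_r, x_i = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    out_r = x_r * cos - x_i * sin
    out_i = x_r * sin + x_i * cos
    return torch.stack([out_r, out_i], dim=-1).flatten(-2)

dim, seq = 8, 6
cos, sin = precompute_freqs(dim, seq)
q, k = torch.randn(seq, dim), torch.randn(seq, dim)
q_rot, k_rot = rotate(q, cos, sin), rotate(k, cos, sin)

# 1) Norms are unchanged by the rotation.
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))

# 2) The attention score between positions i and j depends only on i - j:
#    compare (i=2, j=0) with (i=5, j=3) using the same underlying vectors.
q0, k0 = torch.randn(dim), torch.randn(dim)
def score(i: int, j: int) -> torch.Tensor:
    qi = rotate(q0.unsqueeze(0), cos[i:i + 1], sin[i:i + 1])[0]
    kj = rotate(k0.unsqueeze(0), cos[j:j + 1], sin[j:j + 1])[0]
    return (qi * kj).sum()
print(torch.allclose(score(2, 0), score(5, 3), atol=1e-5))
```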
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   }
+ ]
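modules.json is what `SentenceTransformer(...)` reads to assemble the pipeline: module 0 is the HLM transformer at the repository root and module 1 is the CLS pooling layer in `1_Pooling/`. A small sketch of that resolution step, assuming a local copy of the file (it only inspects the spec and imports the module classes; it does not load weights):

```python
import importlib
import json

# Each entry names a sentence-transformers module class and the sub-directory
# holding that module's configuration.
with open("modules.json") as f:
    module_specs = json.load(f)

for spec in module_specs:
    package, _, class_name = spec["type"].rpartition(".")
    module_cls = getattr(importlib.import_module(package), class_name)
    location = spec["path"] or "<repository root>"
    print(f"module {spec['idx']}: {module_cls.__name__} loaded from {location}")

# Expected output for this checkpoint:
#   module 0: Transformer loaded from <repository root>
#   module 1: Pooling loaded from 1_Pooling
```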
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 256,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenization_hlm.py ADDED
@@ -0,0 +1,664 @@
1
+ import os
2
+ import json
3
+ import unicodedata
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from collections.abc import Mapping
6
+ from collections import Counter
7
+ import itertools
8
+ import torch
9
+
10
+ from transformers.tokenization_utils import PreTrainedTokenizer, PaddingStrategy, TruncationStrategy, TensorType, BatchEncoding
11
+ from transformers.utils import logging, is_torch_tensor
12
+
13
+ TextInput = str
14
+ PreTokenizedInput = List[str]
15
+ EncodedInput = List[List[int]]
16
+ TextInputPair = Tuple[TextInput, TextInput]
17
+ PreTokenizedInputPair = Tuple[PreTokenizedInput, PreTokenizedInput]
18
+ EncodedInputPair = Tuple[EncodedInput, EncodedInput]
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
23
+
24
+ # TODO: add support for return_offsets_mapping
25
+
26
+ class HLMTokenizer(PreTrainedTokenizer):
27
+ r"""
28
+ Constructs a HLM tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
29
+
30
+ Args:
31
+ vocab_file (`str`):
32
+ Path to .json vocab file.
33
+ bos_token (`string`, *optional*, defaults to `"[CLS]"`):
34
+ The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
35
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
36
+ sequence. The token used is the `cls_token`.
37
+ eos_token (`string`, *optional*, defaults to `"[SEP]"`):
38
+ The end of sequence token. When building a sequence using special tokens, this is not the token that is
39
+ used for the end of sequence. The token used is the `sep_token`.
40
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
41
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
42
+ token instead.
43
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
44
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
45
+ sequence classification or for a text and a question for question answering. It is also used as the last
46
+ token of a sequence built with special tokens.
47
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
48
+ The token used for padding, for example when batching sequences of different lengths.
49
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
50
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
51
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
52
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
53
+ The token used for masking values. This is the token used when training this model with masked language
54
+ modeling. This is the token which the model will try to predict.
55
+ word_cls_token (`str`, *optional*, defaults to `"[WORD_CLS]"`):
56
+ The classifier token which is used for word representations and word classification.
57
+ It is the first token of each word when built with special tokens.
58
+ """
59
+
60
+ vocab_files_names = VOCAB_FILES_NAMES
61
+ model_input_names: List[str] = ["input_ids", "char_input_mask", "word_input_mask", "word_type_ids"]
62
+ padding_side: str = "right"
63
+ truncation_side: str = "right"
64
+
65
+ def __init__(
66
+ self,
67
+ vocab_file,
68
+ split_by_punct=False,
69
+ bos_token="[CLS]",
70
+ eos_token="[SEP]",
71
+ unk_token="[UNK]",
72
+ sep_token="[SEP]",
73
+ pad_token="[PAD]",
74
+ cls_token="[CLS]",
75
+ mask_token="[MASK]",
76
+ word_cls_token="[WORD_CLS]",
77
+ max_word_length=None,
78
+ model_max_length=None,
79
+ **kwargs,
80
+ ) -> None:
81
+ if not os.path.isfile(vocab_file):
82
+ raise ValueError(
83
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained"
84
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
85
+ )
86
+
87
+ if max_word_length is not None:
88
+ self.max_word_length = max_word_length
89
+ else:
90
+ try:
91
+ with open(os.path.dirname(vocab_file) + "/config.json", "r") as f:
92
+ config = json.load(f)
93
+ self.max_word_length = config["max_word_length"]
94
+ if model_max_length is None:
95
+ model_max_length = config.get("max_seq_length", None)
96
+ except:
97
+ raise ValueError("Failed to load max_word_length from config.json. Please specify max_word_length.")
98
+
99
+ self.split_by_punct = split_by_punct
100
+ self.vocab_file = vocab_file
101
+ with open(vocab_file, 'r', encoding='utf-8') as f:
102
+ vocab_data = json.load(f)
103
+ self.vocab = vocab_data["vocab"]
104
+ self.inv_vocab = {v: k for k, v in self.vocab.items()}
105
+
106
+ super().__init__(
107
+ bos_token=bos_token,
108
+ eos_token=eos_token,
109
+ unk_token=unk_token,
110
+ sep_token=sep_token,
111
+ pad_token=pad_token,
112
+ cls_token=cls_token,
113
+ mask_token=mask_token,
114
+ split_by_punct=split_by_punct,
115
+ model_max_length=model_max_length,
116
+ **kwargs,
117
+ )
118
+ self.unk_id = self.vocab["[UNK]"]
119
+ self.word_cls_token = word_cls_token
120
+ self.word_cls_token_id = self._convert_token_to_id(word_cls_token)
121
+ self.label_pad_token_id = -100
122
+ self.special_ids = [self._convert_token_to_id(token) for token in vocab_data["special_tokens"]]
123
+
124
+ #self.pad_word = [[self.word_cls_token_id] + [0]*(self.max_word_length-1)]
125
+ #self.pad_mask_word = [[1] + [0]*(self.max_word_length-1)]
126
+ self.pad_word = [[0] + [0]*(self.max_word_length-1)]
127
+ self.pad_mask_word = [[0] + [0]*(self.max_word_length-1)]
128
+
129
+ @staticmethod
130
+ def train(files: List[Union[str, os.PathLike]], output_dir: Union[str, os.PathLike], vocab_size: int=512, max_lines_to_consider=2_000_000):
131
+ char_maps = []
132
+ # Each input file is weighted equally, regardless of size
133
+ # This is to prevent one language from dominating the character distribution
134
+ for file in files:
135
+ print('Loading char counts from', file)
136
+ counter = Counter()
137
+ line_count = 0
138
+ with open(file, "r", encoding="utf-8") as file:
139
+ while line_count < max_lines_to_consider:
140
+ lines = file.readlines(100*1024)
141
+ if len(lines) == 0:
142
+ break
143
+ for line in lines:
144
+ line = unicodedata.normalize('NFKC', line)
145
+ line_count += 1
146
+ counter.update(line)
147
+ d = {}
148
+ total = counter.total()
149
+ for char, count in counter.items():
150
+ d[char] = count / total
151
+ char_maps.append(d)
152
+
153
+ char_map = {}
154
+ for d in char_maps:
155
+ for char, freq in d.items():
156
+ if not char.isspace():
157
+ char_map[char] = char_map.get(char, 0) + freq
158
+
159
+ special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[WORD_CLS]']
160
+ chars_to_keep = sorted(list(char_map.keys()), key=lambda c: char_map[c], reverse=True)[:vocab_size-len(special_tokens)]
161
+ vocab_entries = [*special_tokens, *chars_to_keep]
162
+
163
+ vocab = {
164
+ 'special_tokens': special_tokens,
165
+ 'vocab': { key: i for i, key in enumerate(vocab_entries) }
166
+ }
167
+
168
+ assert(len(vocab_entries) == vocab_size)
169
+
170
+ filename = os.path.join(output_dir, VOCAB_FILES_NAMES["vocab_file"])
171
+ os.makedirs(output_dir, exist_ok=True)
172
+ print("Saving vocab to", filename)
173
+ with open(filename, 'w', encoding='utf-8') as f:
174
+ json.dump(vocab, f, ensure_ascii=False, indent=4)
175
+
176
+ return filename
177
+
178
+ @property
179
+ def vocab_size(self):
180
+ return len(self.vocab)
181
+
182
+ def get_vocab(self):
183
+ return self.vocab
184
+
185
+ def _convert_token_to_id(self, token):
186
+ """Converts a token (str) to an id using the vocab."""
187
+ return self.vocab.get(token, self.unk_id)
188
+
189
+ def _convert_id_to_token(self, index):
190
+ """Converts an index (integer) in a token (str) using the vocab."""
191
+ return self.inv_vocab[index] if index < self.vocab_size else self.unk_token
192
+
193
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str], List[List[str]]]):
194
+ if isinstance(tokens, str):
195
+ return self._convert_token_to_id(tokens)
196
+ if len(tokens) > 0 and isinstance(tokens[0], str):
197
+ return [self._convert_token_to_id(token) for token in tokens]
198
+ return [[self._convert_token_to_id(token) for token in word] for word in tokens]
199
+
200
+ def convert_tokens_to_string(self, tokens):
201
+ """Converts a sequence of tokens (string) in a single string."""
202
+ raise NotImplementedError
203
+
204
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
205
+ if token_ids_1 is None:
206
+ return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id]]
207
+ return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id], [self.cls_token_id]] + token_ids_1 + [[self.eos_token_id]]
208
+
209
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
210
+ return 3 if pair else 2
211
+
212
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
213
+ raise NotImplementedError
214
+
215
+ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None, has_special_tokens=False):
216
+ if has_special_tokens:
217
+ return [0] * (len(token_ids_0)+2) + ([1] * (len(token_ids_1)+2) if token_ids_1 is not None else [])
218
+ else:
219
+ return [0] * len(token_ids_0) + ([1] * len(token_ids_1) if token_ids_1 is not None else [])
220
+
221
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
222
+ filename = VOCAB_FILES_NAMES["vocab_file"]
223
+ if filename_prefix is not None:
224
+ filename = filename_prefix + "-" + filename
225
+ full_path = os.path.join(save_directory, filename)
226
+ with open(full_path, "w", encoding="utf-8") as f:
227
+ json.dump({
228
+ "special_tokens": self.all_special_tokens,
229
+ "vocab": self.get_vocab(),
230
+ }, f, ensure_ascii=False, indent=4)
231
+ return (full_path,)
232
+
233
+ def encode(
234
+ self,
235
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
236
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
237
+ is_split_into_words: bool = False,
238
+ add_special_tokens: bool = False,
239
+ padding: Union[bool, str, PaddingStrategy] = False,
240
+ truncation: Union[bool, str, TruncationStrategy] = None,
241
+ max_length: Optional[int] = None,
242
+ return_tensors: Optional[Union[str, TensorType]] = None,
243
+ **kwargs,
244
+ ) -> List[int]:
245
+ def get_input_ids(text):
246
+ if isinstance(text, str):
247
+ tokens = self.tokenize(text, **kwargs)
248
+ return self.convert_tokens_to_ids(tokens)
249
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
250
+ if is_split_into_words:
251
+ tokens = list(
252
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
253
+ )
254
+ return self.convert_tokens_to_ids(tokens)
255
+ else:
256
+ return self.convert_tokens_to_ids(text)
257
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], List[int]):
258
+ return text
259
+ else:
260
+ raise ValueError(
261
+ f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
262
+
263
+ first_ids = get_input_ids(text)
264
+ second_ids = get_input_ids(text_pair) if text_pair is not None else None
265
+
266
+ if add_special_tokens:
267
+ sequence = self.build_inputs_with_special_tokens(first_ids, second_ids)
268
+ else:
269
+ sequence = first_ids
270
+
271
+ return sequence
272
+
273
+ def prepare_for_model(
274
+ self,
275
+ ids: List[List[int]],
276
+ pair_ids: Optional[List[List[int]]] = None,
277
+ add_special_tokens: bool = True,
278
+ padding: Union[bool, str, PaddingStrategy] = False,
279
+ truncation: Union[bool, str, TruncationStrategy] = None,
280
+ max_length: Optional[int] = None,
281
+ stride: int = 0,
282
+ pad_to_multiple_of: Optional[int] = None,
283
+ return_tensors: Optional[Union[str, TensorType]] = None,
284
+ return_token_type_ids: Optional[bool] = None,
285
+ return_attention_mask: bool = True,
286
+ return_overflowing_tokens: bool = False,
287
+ return_special_tokens_mask: bool = False,
288
+ return_offsets_mapping: bool = False,
289
+ return_length: bool = False,
290
+ verbose: bool = True,
291
+ add_word_cls: bool = True,
292
+ prepend_batch_axis: bool = False,
293
+ **kwargs,
294
+ ) -> BatchEncoding:
295
+ """
296
+ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
297
+ adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
298
+ manages a moving window (with user defined stride) for overflowing tokens.
299
+
300
+ Args:
301
+ ids (`List[List[int]]`):
302
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
303
+ `convert_tokens_to_ids` methods.
304
+ pair_ids (`List[List[int]]`, *optional*):
305
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
306
+ and `convert_tokens_to_ids` methods.
307
+ """
308
+
309
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
310
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
311
+ padding=padding,
312
+ truncation=truncation,
313
+ max_length=max_length,
314
+ pad_to_multiple_of=pad_to_multiple_of,
315
+ verbose=verbose,
316
+ **kwargs,
317
+ )
318
+
319
+ pair = bool(pair_ids is not None)
320
+ len_pair_ids = len(pair_ids) if pair else 0
321
+
322
+ if return_token_type_ids and not add_special_tokens:
323
+ raise ValueError(
324
+ "Asking to return token_type_ids while setting add_special_tokens to False "
325
+ "results in an undefined behavior. Please set add_special_tokens to True or "
326
+ "set return_token_type_ids to None."
327
+ )
328
+
329
+ if (
330
+ return_overflowing_tokens
331
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
332
+ and pair_ids is not None
333
+ ):
334
+ raise ValueError(
335
+ "Not possible to return overflowing tokens for pair of sequences with the "
336
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
337
+ "for instance `only_second` or `only_first`."
338
+ )
339
+
340
+ encoded_inputs = {}
341
+
342
+ # Compute the total size of the returned encodings
343
+ total_len = len(ids) + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
344
+
345
+ # Truncation: Handle max sequence length
346
+ overflowing_tokens = []
347
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
348
+ ids, pair_ids, overflowing_tokens = self.truncate_sequences(
349
+ ids,
350
+ pair_ids=pair_ids,
351
+ num_tokens_to_remove=total_len - max_length,
352
+ truncation_strategy=truncation_strategy,
353
+ stride=stride,
354
+ )
355
+
356
+ if return_overflowing_tokens:
357
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
358
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length
359
+
360
+ if add_special_tokens:
361
+ sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
362
+ else:
363
+ sequence = ids + pair_ids if pair else ids
364
+
365
+ if add_word_cls:
366
+ for word in sequence:
367
+ word.insert(0, self.word_cls_token_id)
368
+
369
+ # Build output dictionary
370
+ encoded_inputs["input_ids"] = sequence
371
+ encoded_inputs["char_input_mask"] = [[1]*len(word)+[0]*(self.max_word_length-len(word)) for word in sequence]
372
+ encoded_inputs["word_input_mask"] = [1]*len(sequence)
373
+ if return_token_type_ids or pair:
374
+ encoded_inputs["word_type_ids"] = self.create_token_type_ids_from_sequences(ids, pair_ids, add_special_tokens)
375
+ assert len(encoded_inputs["word_type_ids"]) == len(encoded_inputs["word_input_mask"])
376
+
377
+ # Always pad words
378
+ for word in encoded_inputs["input_ids"]:
379
+ if len(word) < self.max_word_length:
380
+ word.extend([self.pad_token_id] * (self.max_word_length - len(word)))
381
+
382
+ # Padding
383
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
384
+ encoded_inputs = self.pad(
385
+ encoded_inputs,
386
+ max_length=max_length,
387
+ padding=padding_strategy.value,
388
+ pad_to_multiple_of=pad_to_multiple_of,
389
+ return_attention_mask=return_attention_mask,
390
+ )
391
+
392
+ batch_outputs = BatchEncoding(
393
+ encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
394
+ )
395
+
396
+ return batch_outputs
397
+
398
+ def _encode_plus(
399
+ self,
400
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
401
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
402
+ add_special_tokens: bool = True,
403
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
404
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
405
+ max_length: Optional[int] = None,
406
+ stride: int = 0,
407
+ is_split_into_words: bool = False,
408
+ pad_to_multiple_of: Optional[int] = None,
409
+ return_tensors: Optional[Union[str, TensorType]] = None,
410
+ return_token_type_ids: Optional[bool] = None,
411
+ return_attention_mask: Optional[bool] = None,
412
+ return_overflowing_tokens: bool = False,
413
+ return_special_tokens_mask: bool = False,
414
+ return_offsets_mapping: bool = False,
415
+ return_length: bool = False,
416
+ verbose: bool = True,
417
+ add_word_cls: bool = True,
418
+ **kwargs,
419
+ ) -> BatchEncoding:
420
+ def get_input_ids(text):
421
+ if isinstance(text, str):
422
+ tokens = self.tokenize(text, **kwargs)
423
+ return self.convert_tokens_to_ids(tokens)
424
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
425
+ if is_split_into_words:
426
+ tokens = list(
427
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
428
+ )
429
+ return self.convert_tokens_to_ids(tokens)
430
+ else:
431
+ return self.convert_tokens_to_ids(text)
432
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
433
+ return text
434
+ else:
435
+ raise ValueError(
436
+ f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
437
+
438
+ if return_offsets_mapping:
439
+ raise NotImplementedError(
440
+ "return_offset_mapping is not available when using Python tokenizers. "
441
+ "To use this feature, change your tokenizer to one deriving from "
442
+ "transformers.PreTrainedTokenizerFast. "
443
+ "More information on available tokenizers at "
444
+ "https://github.com/huggingface/transformers/pull/2674"
445
+ )
446
+
447
+ first_ids = get_input_ids(text)
448
+ second_ids = get_input_ids(text_pair) if text_pair is not None else None
449
+
450
+ return self.prepare_for_model(
451
+ first_ids,
452
+ pair_ids=second_ids,
453
+ add_special_tokens=add_special_tokens,
454
+ padding=padding_strategy.value,
455
+ truncation=truncation_strategy.value,
456
+ max_length=max_length,
457
+ stride=stride,
458
+ pad_to_multiple_of=pad_to_multiple_of,
459
+ return_tensors=return_tensors,
460
+ prepend_batch_axis=True,
461
+ return_attention_mask=return_attention_mask,
462
+ return_token_type_ids=return_token_type_ids,
463
+ return_overflowing_tokens=return_overflowing_tokens,
464
+ return_special_tokens_mask=return_special_tokens_mask,
465
+ return_length=return_length,
466
+ verbose=verbose,
467
+ add_word_cls=add_word_cls,
468
+ )
469
+
470
+ def _batch_encode_plus(
471
+ self,
472
+ batch_text_or_text_pairs: Union[
473
+ List[TextInput],
474
+ List[TextInputPair],
475
+ List[PreTokenizedInput],
476
+ List[PreTokenizedInputPair],
477
+ List[EncodedInput],
478
+ List[EncodedInputPair],
479
+ ],
480
+ add_special_tokens: bool = True,
481
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
482
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
483
+ max_length: Optional[int] = None,
484
+ stride: int = 0,
485
+ is_split_into_words: bool = False,
486
+ pad_to_multiple_of: Optional[int] = None,
487
+ return_tensors: Optional[Union[str, TensorType]] = None,
488
+ return_token_type_ids: Optional[bool] = None,
489
+ return_attention_mask: Optional[bool] = None,
490
+ return_overflowing_tokens: bool = False,
491
+ return_special_tokens_mask: bool = False,
492
+ return_offsets_mapping: bool = False,
493
+ return_length: bool = False,
494
+ verbose: bool = True,
495
+ **kwargs,
496
+ ) -> BatchEncoding:
497
+ def get_input_ids(text):
498
+ if isinstance(text, str):
499
+ tokens = self.tokenize(text, **kwargs)
500
+ return self.convert_tokens_to_ids(tokens)
501
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
502
+ if is_split_into_words:
503
+ tokens = list(
504
+ itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
505
+ )
506
+ return self.convert_tokens_to_ids(tokens)
507
+ else:
508
+ return self.convert_tokens_to_ids(text)
509
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
510
+ return text
511
+ else:
512
+ raise ValueError(
513
+ "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
514
+ )
515
+
516
+ if return_offsets_mapping:
517
+ raise NotImplementedError(
518
+ "return_offset_mapping is not available when using Python tokenizers. "
519
+ "To use this feature, change your tokenizer to one deriving from "
520
+ "transformers.PreTrainedTokenizerFast."
521
+ )
522
+
523
+ input_ids = []
524
+ for ids_or_pair_ids in batch_text_or_text_pairs:
525
+ if not isinstance(ids_or_pair_ids, (list, tuple)):
526
+ ids, pair_ids = ids_or_pair_ids, None
527
+ elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
528
+ ids, pair_ids = ids_or_pair_ids, None
529
+ else:
530
+ ids, pair_ids = ids_or_pair_ids
531
+
532
+ first_ids = get_input_ids(ids)
533
+ second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
534
+ input_ids.append((first_ids, second_ids))
535
+
536
+ batch_outputs = self._batch_prepare_for_model(
537
+ input_ids,
538
+ add_special_tokens=add_special_tokens,
539
+ padding_strategy=padding_strategy,
540
+ truncation_strategy=truncation_strategy,
541
+ max_length=max_length,
542
+ stride=stride,
543
+ pad_to_multiple_of=pad_to_multiple_of,
544
+ return_attention_mask=return_attention_mask,
545
+ return_token_type_ids=return_token_type_ids,
546
+ return_overflowing_tokens=return_overflowing_tokens,
547
+ return_special_tokens_mask=return_special_tokens_mask,
548
+ return_length=return_length,
549
+ return_tensors=return_tensors,
550
+ verbose=verbose,
551
+ )
552
+
553
+ return BatchEncoding(batch_outputs)
554
+
555
+ def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, split_long_words: bool = True) -> List[List[str]]:
556
+ text = unicodedata.normalize('NFKC', text)
557
+ if split_long_words:
558
+ tokenized_text = []
559
+ for token in text.split():
560
+ tokens = [char for char in token]
561
+ tokenized_text.extend(
562
+ tokens[i: i + self.max_word_length - 1] for i in range(0, len(tokens), self.max_word_length - 1))
563
+ return tokenized_text
564
+ else:
565
+ return [[char for char in token] for token in text.split()]
566
+
567
+ def pad(
568
+ self,
569
+ encoded_inputs: Union[
570
+ BatchEncoding,
571
+ List[BatchEncoding],
572
+ Dict[str, EncodedInput],
573
+ Dict[str, List[EncodedInput]],
574
+ List[Dict[str, EncodedInput]],
575
+ ],
576
+ padding: Union[bool, str, PaddingStrategy] = True,
577
+ max_length: Optional[int] = None,
578
+ pad_to_multiple_of: Optional[int] = None, # TODO: add support for pad_to_multiple_of
579
+ return_attention_mask: Optional[bool] = None,
580
+ return_tensors: Optional[Union[str, TensorType]] = None,
581
+ #label_pad_token_id=-100,
582
+ verbose: bool = True,
583
+ ) -> BatchEncoding:
584
+ # If we have a list of dicts, let's convert it to a dict of lists
585
+ # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
586
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
587
+ encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
588
+
589
+ # The model's main input name, usually `input_ids`, has to be passed for padding
590
+ #if self.model_input_names[0] not in encoded_inputs:
591
+ # raise ValueError(
592
+ # "You should supply an encoding or a list of encodings to this method "
593
+ # f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
594
+ # )
595
+
596
+ required_input = encoded_inputs["input_ids"]
597
+
598
+ #if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
599
+ # if return_attention_mask:
600
+ # encoded_inputs["char_input_mask"] = []
601
+ # encoded_inputs["word_input_mask"] = []
602
+ # return encoded_inputs
603
+
604
+ # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
605
+ # and rebuild them afterwards if no return_tensors is specified
606
+ # Note that we lose the specific device the tensor may be on for PyTorch
607
+
608
+ #first_element = required_input[0]
609
+ ## At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
610
+ #if not isinstance(first_element, (int, list, tuple)):
611
+ # if is_torch_tensor(first_element):
612
+ # return_tensors = "pt" if return_tensors is None else return_tensors
613
+
614
+ # for key, value in encoded_inputs.items():
615
+ # encoded_inputs[key] = to_py_obj(value)
616
+
617
+ # Convert padding_strategy in PaddingStrategy
618
+ padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
619
+ padding=padding, max_length=max_length, verbose=verbose)
620
+
621
+ if padding_strategy == PaddingStrategy.DO_NOT_PAD:
622
+ return encoded_inputs
623
+
624
+ assert (padding_strategy == PaddingStrategy.LONGEST)
625
+
626
+ longest_in_batch = max(len(f) for f in required_input)
627
+ batch_outputs = {}
628
+ batch_outputs["input_ids"] = [f + self.pad_word*(longest_in_batch - len(f)) for f in encoded_inputs["input_ids"]]
629
+ batch_outputs["char_input_mask"] = [f + self.pad_mask_word*(longest_in_batch - len(f)) for f in encoded_inputs["char_input_mask"]]
630
+
631
+ batch_outputs["word_input_mask"] = \
632
+ [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs['word_input_mask']]
633
+
634
+ if "word_type_ids" in encoded_inputs:
635
+ batch_outputs["word_type_ids"] = [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs["word_type_ids"]]
636
+
637
+ batch_outputs["char_input_mask"] = torch.tensor(batch_outputs["char_input_mask"], dtype=torch.bool)
638
+ batch_outputs["word_input_mask"] = torch.tensor(batch_outputs["word_input_mask"], dtype=torch.bool)
639
+
640
+ # TODO: move label names elsewhere
641
+ label_fields = ('labels', 'upos', 'feats', 'heads', 'deprels', 'lemmas')
642
+ label_names = [feature for feature in encoded_inputs.keys() if feature in label_fields]
643
+
644
+ if len(label_names) > 0:
645
+ def to_list(tensor_or_iterable):
646
+ if is_torch_tensor(tensor_or_iterable):
647
+ return tensor_or_iterable.tolist()
648
+ return list(tensor_or_iterable)
649
+
650
+ for label_name in label_names:
651
+ if label_name not in encoded_inputs:
652
+ continue
653
+ labels = encoded_inputs[label_name]
654
+ label_pad_word = [[self.label_pad_token_id]*self.max_word_length]
655
+ if self.padding_side == "right":
656
+ batch_outputs[label_name] = [
657
+ to_list(label) + label_pad_word * (longest_in_batch - len(label)) for label in labels
658
+ ]
659
+ else:
660
+ batch_outputs[label_name] = [
661
+ label_pad_word * (longest_in_batch - len(label)) + to_list(label) for label in labels
662
+ ]
663
+
664
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
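Taken together, the methods above encode each whitespace-separated word as its own list of character ids: `tokenize` splits text into per-word character lists (NFKC-normalized, long words chunked), `prepare_for_model` prepends the `[WORD_CLS]` id to every word and pads each word to `max_word_length`, and `pad` batches words with character-level and word-level masks. The sketch below is illustrative only and not part of the commit: `encode_words`, `MAX_WORD_LENGTH = 8`, and the toy character vocabulary are assumptions, while ids 5 (`[WORD_CLS]`) and 0 (`[PAD]`) match the vocab.json added below.

```python
# Minimal sketch (not part of the commit) of the word/character layout the
# tokenizer above produces. MAX_WORD_LENGTH, encode_words, and the toy
# vocabulary are assumptions; ids 5 ([WORD_CLS]) and 0 ([PAD]) match vocab.json.
MAX_WORD_LENGTH = 8
WORD_CLS_ID, PAD_ID = 5, 0

def encode_words(words, char_to_id):
    input_ids, char_mask = [], []
    for word in words:
        # one [WORD_CLS] id plus the word's character ids (the real tokenizer
        # splits overlong words into chunks instead of truncating them)
        chars = [WORD_CLS_ID] + [char_to_id[c] for c in word][: MAX_WORD_LENGTH - 1]
        char_mask.append([1] * len(chars) + [0] * (MAX_WORD_LENGTH - len(chars)))
        input_ids.append(chars + [PAD_ID] * (MAX_WORD_LENGTH - len(chars)))
    return {
        "input_ids": input_ids,        # [num_words, MAX_WORD_LENGTH] character ids
        "char_input_mask": char_mask,  # 1 for real characters, 0 for padding
        "word_input_mask": [1] * len(words),
    }

print(encode_words(["an", "example"], {c: i + 6 for i, c in enumerate("anexmpl")}))
```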
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "eos_token": "[SEP]",
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "split_by_punct": false,
53
+ "tokenizer_class": "HLMTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
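The configuration above registers the five special tokens at ids 0–4 and points `tokenizer_class` at `HLMTokenizer`. As a quick sanity check, the ids declared in `added_tokens_decoder` can be compared against the character vocabulary added below; the snippet is hypothetical and assumes both JSON files sit in the working directory.

```python
# Hypothetical check (not part of the commit): the ids in added_tokens_decoder
# should agree with the [PAD]/[UNK]/[CLS]/[SEP]/[MASK] entries of vocab.json.
import json

with open("tokenizer_config.json") as f:  # assumed local path
    added = json.load(f)["added_tokens_decoder"]
with open("vocab.json") as f:             # assumed local path
    vocab = json.load(f)["vocab"]

for token_id, entry in added.items():
    assert vocab[entry["content"]] == int(token_id), entry["content"]
print("special token ids consistent:", [e["content"] for e in added.values()])
```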
vocab.json ADDED
@@ -0,0 +1,523 @@
1
+ {
2
+ "special_tokens": [
3
+ "[CLS]",
4
+ "[SEP]",
5
+ "[UNK]",
6
+ "[PAD]",
7
+ "[MASK]"
8
+ ],
9
+ "vocab": {
10
+ "[PAD]": 0,
11
+ "[UNK]": 1,
12
+ "[CLS]": 2,
13
+ "[SEP]": 3,
14
+ "[MASK]": 4,
15
+ "[WORD_CLS]": 5,
16
+ "e": 6,
17
+ "t": 7,
18
+ "ν": 8,
19
+ "a": 9,
20
+ "τ": 10,
21
+ "o": 11,
22
+ "α": 12,
23
+ "n": 13,
24
+ "ο": 14,
25
+ "h": 15,
26
+ "i": 16,
27
+ "s": 17,
28
+ "r": 18,
29
+ "ε": 19,
30
+ "ι": 20,
31
+ "d": 21,
32
+ "l": 22,
33
+ "ς": 23,
34
+ ",": 24,
35
+ "ρ": 25,
36
+ "κ": 26,
37
+ "σ": 27,
38
+ "π": 28,
39
+ "u": 29,
40
+ "μ": 30,
41
+ "λ": 31,
42
+ "m": 32,
43
+ "c": 33,
44
+ "f": 34,
45
+ "w": 35,
46
+ ".": 36,
47
+ "δ": 37,
48
+ "g": 38,
49
+ "y": 39,
50
+ "υ": 40,
51
+ "p": 41,
52
+ "ί": 42,
53
+ "γ": 43,
54
+ "ὶ": 44,
55
+ "b": 45,
56
+ "ω": 46,
57
+ "έ": 47,
58
+ "ἐ": 48,
59
+ "η": 49,
60
+ "θ": 50,
61
+ "ὸ": 51,
62
+ "ά": 52,
63
+ "ό": 53,
64
+ "ἀ": 54,
65
+ "v": 55,
66
+ "ῦ": 56,
67
+ "ὰ": 57,
68
+ "χ": 58,
69
+ "φ": 59,
70
+ "k": 60,
71
+ "ῶ": 61,
72
+ "ὐ": 62,
73
+ "ύ": 63,
74
+ "ῖ": 64,
75
+ "̅": 65,
76
+ "ὲ": 66,
77
+ "ὴ": 67,
78
+ "’": 68,
79
+ "I": 69,
80
+ "β": 70,
81
+ "ῆ": 71,
82
+ "ή": 72,
83
+ "ἰ": 73,
84
+ "\"": 74,
85
+ "·": 75,
86
+ "ξ": 76,
87
+ "T": 77,
88
+ ";": 78,
89
+ "ἔ": 79,
90
+ "ὁ": 80,
91
+ "A": 81,
92
+ "ἡ": 82,
93
+ "ώ": 83,
94
+ "ὑ": 84,
95
+ "ῷ": 85,
96
+ "ἄ": 86,
97
+ ":": 87,
98
+ "”": 88,
99
+ "“": 89,
100
+ "ζ": 90,
101
+ "ὺ": 91,
102
+ "ὅ": 92,
103
+ "S": 93,
104
+ "x": 94,
105
+ "H": 95,
106
+ "ἱ": 96,
107
+ "L": 97,
108
+ "-": 98,
109
+ "'": 99,
110
+ "M": 100,
111
+ "ῳ": 101,
112
+ "?": 102,
113
+ "ῇ": 103,
114
+ "ψ": 104,
115
+ "B": 105,
116
+ "W": 106,
117
+ "C": 107,
118
+ "ᾶ": 108,
119
+ "ὡ": 109,
120
+ "ἑ": 110,
121
+ "2": 111,
122
+ "ἴ": 112,
123
+ "ἶ": 113,
124
+ "—": 114,
125
+ "E": 115,
126
+ "Κ": 116,
127
+ "O": 117,
128
+ "Ἀ": 118,
129
+ "Π": 119,
130
+ "ὀ": 120,
131
+ "ῃ": 121,
132
+ "N": 122,
133
+ "D": 123,
134
+ "ὕ": 124,
135
+ "ἢ": 125,
136
+ "!": 126,
137
+ "R": 127,
138
+ "P": 128,
139
+ "q": 129,
140
+ "j": 130,
141
+ "1": 131,
142
+ "G": 132,
143
+ "0": 133,
144
+ "ὖ": 134,
145
+ "F": 135,
146
+ "Τ": 136,
147
+ "Σ": 137,
148
+ "ὄ": 138,
149
+ "Δ": 139,
150
+ "ὼ": 140,
151
+ "ἕ": 141,
152
+ "ᾳ": 142,
153
+ "Μ": 143,
154
+ "z": 144,
155
+ "Θ": 145,
156
+ "Y": 146,
157
+ "ἁ": 147,
158
+ "ἂ": 148,
159
+ "ὔ": 149,
160
+ "ῥ": 150,
161
+ "Ε": 151,
162
+ "Α": 152,
163
+ "ἦ": 153,
164
+ ")": 154,
165
+ "(": 155,
166
+ "ὥ": 156,
167
+ "ἷ": 157,
168
+ "J": 158,
169
+ "Ο": 159,
170
+ "ἵ": 160,
171
+ "Ἰ": 161,
172
+ "‘": 162,
173
+ "ʹ": 163,
174
+ "Ἐ": 164,
175
+ "ἤ": 165,
176
+ "3": 166,
177
+ "Λ": 167,
178
+ "ἅ": 168,
179
+ "Β": 169,
180
+ "ὗ": 170,
181
+ "«": 171,
182
+ "»": 172,
183
+ "Γ": 173,
184
+ "[": 174,
185
+ "]": 175,
186
+ "4": 176,
187
+ "ὃ": 177,
188
+ "Χ": 178,
189
+ "ἠ": 179,
190
+ "*": 180,
191
+ "〉": 181,
192
+ "〈": 182,
193
+ "V": 183,
194
+ "K": 184,
195
+ "U": 185,
196
+ "Ν": 186,
197
+ "Φ": 187,
198
+ "5": 188,
199
+ "ὧ": 189,
200
+ "ἥ": 190,
201
+ "6": 191,
202
+ "8": 192,
203
+ "ᾷ": 193,
204
+ "&": 194,
205
+ "7": 195,
206
+ "9": 196,
207
+ "Ῥ": 197,
208
+ "Ι": 198,
209
+ "ὠ": 199,
210
+ "Ζ": 200,
211
+ "Ὁ": 201,
212
+ "Ἡ": 202,
213
+ "ὦ": 203,
214
+ "Ἄ": 204,
215
+ "Ὅ": 205,
216
+ "ϊ": 206,
217
+ "Ἑ": 207,
218
+ "ἃ": 208,
219
+ "X": 209,
220
+ "ἧ": 210,
221
+ "ἣ": 211,
222
+ "Ἔ": 212,
223
+ "Η": 213,
224
+ "Υ": 214,
225
+ "ἓ": 215,
226
+ "ῴ": 216,
227
+ "Ρ": 217,
228
+ "ᾧ": 218,
229
+ "Ὀ": 219,
230
+ "ΐ": 220,
231
+ "Ἱ": 221,
232
+ "`": 222,
233
+ "ῤ": 223,
234
+ "ὢ": 224,
235
+ "Ϛ": 225,
236
+ "Ω": 226,
237
+ "ῄ": 227,
238
+ "ὤ": 228,
239
+ "ᾖ": 229,
240
+ "̲": 230,
241
+ "ᾗ": 231,
242
+ "ἳ": 232,
243
+ "Ἕ": 233,
244
+ "Q": 234,
245
+ "Z": 235,
246
+ "ὓ": 236,
247
+ "„": 237,
248
+ "Ξ": 238,
249
+ "Ὑ": 239,
250
+ "†": 240,
251
+ "ἆ": 241,
252
+ "ὂ": 242,
253
+ "é": 243,
254
+ "+": 244,
255
+ "Ἴ": 245,
256
+ "ᾠ": 246,
257
+ "Ὡ": 247,
258
+ "ϋ": 248,
259
+ "Ἠ": 249,
260
+ "𐅻": 250,
261
+ "|": 251,
262
+ "ᾴ": 252,
263
+ "Ὥ": 253,
264
+ "ᾔ": 254,
265
+ "ῒ": 255,
266
+ "𐆄": 256,
267
+ "Ψ": 257,
268
+ "Ἁ": 258,
269
+ "Ὠ": 259,
270
+ "Ἥ": 260,
271
+ "ᾤ": 261,
272
+ "Ἅ": 262,
273
+ "#": 263,
274
+ "–": 264,
275
+ "̈": 265,
276
+ "Ἵ": 266,
277
+ "𐅶": 267,
278
+ "_": 268,
279
+ "ö": 269,
280
+ "Ὄ": 270,
281
+ "ᾐ": 271,
282
+ "ᾄ": 272,
283
+ "Ἢ": 273,
284
+ "�": 274,
285
+ "Ἤ": 275,
286
+ "Ὃ": 276,
287
+ "Ἦ": 277,
288
+ "𐅵": 278,
289
+ "‖": 279,
290
+ "}": 280,
291
+ "{": 281,
292
+ "͵": 282,
293
+ "=": 283,
294
+ "⸢": 284,
295
+ "⸥": 285,
296
+ "æ": 286,
297
+ "Ὦ": 287,
298
+ "Ἆ": 288,
299
+ "⸤": 289,
300
+ "⏑": 290,
301
+ "ὣ": 291,
302
+ "ᾰ": 292,
303
+ "⟦": 293,
304
+ "⟧": 294,
305
+ "Ὕ": 295,
306
+ "ᾀ": 296,
307
+ "ᾅ": 297,
308
+ "⸏": 298,
309
+ "‹": 299,
310
+ "›": 300,
311
+ "è": 301,
312
+ "á": 302,
313
+ "Ϟ": 303,
314
+ ">": 304,
315
+ "Ὧ": 305,
316
+ "<": 306,
317
+ "Ϙ": 307,
318
+ "œ": 308,
319
+ "ΰ": 309,
320
+ "□": 310,
321
+ "͜": 311,
322
+ "ᾱ": 312,
323
+ "́": 313,
324
+ "ᾑ": 314,
325
+ "ˈ": 315,
326
+ "ë": 316,
327
+ "Ἂ": 317,
328
+ "′": 318,
329
+ "ῐ": 319,
330
+ "ϝ": 320,
331
+ "Ὢ": 321,
332
+ "ᾆ": 322,
333
+ "ῠ": 323,
334
+ "⩚": 324,
335
+ "►": 325,
336
+ "◄": 326,
337
+ "§": 327,
338
+ "𐆃": 328,
339
+ "ñ": 329,
340
+ "ῑ": 330,
341
+ "×": 331,
342
+ "Ἃ": 332,
343
+ "ῡ": 333,
344
+ "ἲ": 334,
345
+ "ῂ": 335,
346
+ "⸣": 336,
347
+ "±": 337,
348
+ "‵": 338,
349
+ "%": 339,
350
+ "ü": 340,
351
+ "Ἶ": 341,
352
+ "Ὤ": 342,
353
+ "𐆊": 343,
354
+ "ê": 344,
355
+ "à": 345,
356
+ "̄": 346,
357
+ "ç": 347,
358
+ "â": 348,
359
+ "ä": 349,
360
+ "ô": 350,
361
+ "$": 351,
362
+ "𐆆": 352,
363
+ "̓": 353,
364
+ "Ὣ": 354,
365
+ "/": 355,
366
+ "Á": 356,
367
+ "£": 357,
368
+ "Ἧ": 358,
369
+ "Ἓ": 359,
370
+ "ᾂ": 360,
371
+ "Ϡ": 361,
372
+ "^": 362,
373
+ "א": 363,
374
+ "ā": 364,
375
+ "⧙": 365,
376
+ "⧘": 366,
377
+ "̔": 367,
378
+ "Æ": 368,
379
+ "ó": 369,
380
+ "ŭ": 370,
381
+ "É": 371,
382
+ "°": 372,
383
+ "ὒ": 373,
384
+ "̇": 374,
385
+ "⁝": 375,
386
+ "Ἣ": 376,
387
+ "ᾕ": 377,
388
+ "ï": 378,
389
+ "ῗ": 379,
390
+ "ϼ": 380,
391
+ "î": 381,
392
+ "⏒": 382,
393
+ "𐆂": 383,
394
+ "ῼ": 384,
395
+ "í": 385,
396
+ "ᾦ": 386,
397
+ "√": 387,
398
+ "⏔": 388,
399
+ "⸐": 389,
400
+ "ϛ": 390,
401
+ "♃": 391,
402
+ "̆": 392,
403
+ "ἒ": 393,
404
+ "↑": 394,
405
+ "ἇ": 395,
406
+ "ú": 396,
407
+ "û": 397,
408
+ "ē": 398,
409
+ "ᾇ": 399,
410
+ "ῢ": 400,
411
+ "ð": 401,
412
+ "❛": 402,
413
+ "❜": 403,
414
+ "͂": 404,
415
+ "ū": 405,
416
+ "ī": 406,
417
+ "‚": 407,
418
+ "‛": 408,
419
+ "@": 409,
420
+ "⊗": 410,
421
+ "Ϊ": 411,
422
+ "š": 412,
423
+ "ῲ": 413,
424
+ "‧": 414,
425
+ "ś": 415,
426
+ "⏓": 416,
427
+ "⸎": 417,
428
+ "⸓": 418,
429
+ "ṛ": 419,
430
+ "ù": 420,
431
+ "ō": 421,
432
+ "̀": 422,
433
+ "ᾲ": 423,
434
+ "ṇ": 424,
435
+ "ᾡ": 425,
436
+ "※": 426,
437
+ "ͅ": 427,
438
+ "Î": 428,
439
+ "ῧ": 429,
440
+ "ě": 430,
441
+ "ῌ": 431,
442
+ "⁄": 432,
443
+ "ž": 433,
444
+ "̶": 434,
445
+ "Ç": 435,
446
+ "ב": 436,
447
+ "Ὶ": 437,
448
+ "•": 438,
449
+ "\\": 439,
450
+ "י": 440,
451
+ "𐆅": 441,
452
+ "þ": 442,
453
+ "͝": 443,
454
+ "À": 444,
455
+ "ṃ": 445,
456
+ "ו": 446,
457
+ "✕": 447,
458
+ "Ί": 448,
459
+ "Ὗ": 449,
460
+ "å": 450,
461
+ "ר": 451,
462
+ "Ϝ": 452,
463
+ "ì": 453,
464
+ "Œ": 454,
465
+ "⸍": 455,
466
+ "𐅷": 456,
467
+ "Ά": 457,
468
+ "⎫": 458,
469
+ "⎬": 459,
470
+ "⎭": 460,
471
+ "☩": 461,
472
+ "ת": 462,
473
+ "ĝ": 463,
474
+ "☉": 464,
475
+ "È": 465,
476
+ "ש": 466,
477
+ "Έ": 467,
478
+ "ᾼ": 468,
479
+ "↕": 469,
480
+ "מ": 470,
481
+ "נ": 471,
482
+ "ϳ": 472,
483
+ "♎": 473,
484
+ "♏": 474,
485
+ "♑": 475,
486
+ "ò": 476,
487
+ "ה": 477,
488
+ "ן": 478,
489
+ "ק": 479,
490
+ "ė": 480,
491
+ "♈": 481,
492
+ "♉": 482,
493
+ "♋": 483,
494
+ "♌": 484,
495
+ "̳": 485,
496
+ "⳨": 486,
497
+ "♐": 487,
498
+ "♒": 488,
499
+ "♓": 489,
500
+ "͡": 490,
501
+ "Ñ": 491,
502
+ "ã": 492,
503
+ "ס": 493,
504
+ "̂": 494,
505
+ "♊": 495,
506
+ "♍": 496,
507
+ "⁚": 497,
508
+ "ᾁ": 498,
509
+ "⏕": 499,
510
+ "∶": 500,
511
+ "ל": 501,
512
+ "פ": 502,
513
+ "ą": 503,
514
+ "ĩ": 504,
515
+ "⎪": 505,
516
+ "ח": 506,
517
+ "∧": 507,
518
+ "צ": 508,
519
+ "Ἇ": 509,
520
+ "Ͻ": 510,
521
+ "҅": 511
522
+ }
523
+ }
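Because the vocabulary is purely character-level, id lookup amounts to NFKC-normalizing a word (as `tokenize` does above) and mapping each character, with `[UNK]` as the fallback. The snippet below is illustrative only; the local `vocab.json` path is an assumption.

```python
# Illustrative character-id lookup against the vocabulary above; the local
# file path is an assumption, not part of the commit.
import json
import unicodedata

with open("vocab.json") as f:
    char_to_id = json.load(f)["vocab"]

for word in ["sentence", "λόγος"]:
    word = unicodedata.normalize("NFKC", word)  # mirrors HLMTokenizer.tokenize
    print(word, [char_to_id.get(ch, char_to_id["[UNK]"]) for ch in word])
```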