Merge branch 'main' of https://huggingface.co/jinaai/jina-embeddings-v2
Browse files
- custom_st.py +10 -10
custom_st.py
CHANGED
@@ -1,13 +1,11 @@
|
|
1 |
-
import base64
|
2 |
import json
|
3 |
import os
|
4 |
from io import BytesIO
|
5 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
6 |
|
7 |
-
import requests
|
8 |
import torch
|
9 |
from torch import nn
|
10 |
-
from transformers import AutoConfig,
|
11 |
|
12 |
|
13 |
class Transformer(nn.Module):
|
@@ -35,11 +33,11 @@ class Transformer(nn.Module):
|
|
35 |
def __init__(
|
36 |
self,
|
37 |
model_name_or_path: str,
|
38 |
-
max_seq_length: int
|
39 |
-
model_args:
|
40 |
-
tokenizer_args:
|
41 |
-
config_args:
|
42 |
-
cache_dir: str
|
43 |
do_lower_case: bool = False,
|
44 |
tokenizer_name_or_path: str = None,
|
45 |
) -> None:
|
@@ -121,8 +119,10 @@ class Transformer(nn.Module):
|
|
121 |
return self.auto_model.config.hidden_size
|
122 |
|
123 |
def tokenize(
|
124 |
-
self,
|
125 |
-
|
|
|
|
|
126 |
"""Tokenizes a text and maps tokens to token-ids"""
|
127 |
output = {}
|
128 |
if isinstance(texts[0], str):
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
from io import BytesIO
|
4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
|
|
|
6 |
import torch
|
7 |
from torch import nn
|
8 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
9 |
|
10 |
|
11 |
class Transformer(nn.Module):
|
|
|
33 |
def __init__(
|
34 |
self,
|
35 |
model_name_or_path: str,
|
36 |
+
max_seq_length: int = None,
|
37 |
+
model_args: Dict[str, Any] = None,
|
38 |
+
tokenizer_args: Dict[str, Any] = None,
|
39 |
+
config_args: Dict[str, Any] = None,
|
40 |
+
cache_dir: str = None,
|
41 |
do_lower_case: bool = False,
|
42 |
tokenizer_name_or_path: str = None,
|
43 |
) -> None:
|
|
|
119 |
return self.auto_model.config.hidden_size
|
120 |
|
121 |
def tokenize(
|
122 |
+
self,
|
123 |
+
texts: Union[List[str], List[dict], List[Tuple[str, str]]],
|
124 |
+
padding: Union[str, bool] = True
|
125 |
+
) -> Dict[str, torch.Tensor]:
|
126 |
"""Tokenizes a text and maps tokens to token-ids"""
|
127 |
output = {}
|
128 |
if isinstance(texts[0], str):
|