isoformer-anonymous committed
Commit: eeaf5b8 · Parent(s): c4cb572

Upload tokenizer

Files changed:
- isoformer_tokenizer.py (+1, -6)

isoformer_tokenizer.py CHANGED
@@ -36,7 +36,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        config,
         **kwargs
     ):
 
@@ -55,9 +54,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
         # protein_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
         # protein_hf_tokenizer.init_kwargs["eos_token"] = None # Ensures it doesn't come back when reloading
 
-        self.num_tokens_per_seq_nuctf = config.num_tokens_per_seq_nuctf
-        self.num_tokens_per_seq_nuctf_rna = config.num_tokens_per_seq_nuctf_rna
-        self.num_protein_tokens_per_seq = config.num_protein_tokens_per_seq
         self.dna_tokenizer = dna_hf_tokenizer
         self.rna_tokenizer = rna_hf_tokenizer
         self.protein_tokenizer = protein_hf_tokenizer
@@ -65,12 +61,11 @@ class IsoformerTokenizer(PreTrainedTokenizer):
         self.dna_tokens = open("dna_vocab_list.txt", "r").read() .split("\n")
         self.rna_tokens = open("rna_vocab_list.txt", "r").read() .split("\n")
         self.protein_tokens = open("protein_vocab_list.txt", "r").read() .split("\n")
-        self.config = config
 
         super().__init__(**kwargs)
 
     def __call__(self, dna_input, rna_input, protein_input):
-        dna_output = self.dna_tokenizer(dna_input)
+        dna_output = self.dna_tokenizer(dna_input)
         rna_output = self.rna_tokenizer(rna_input, max_length=1024, padding="max_length")
         protein_output = self.protein_tokenizer(protein_input, max_length=1024, padding="max_length")
         return dna_output, rna_output, protein_output
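For context, the net effect of this commit is that the tokenizer no longer takes a config object: construction goes through **kwargs only, and the num_tokens_per_seq_* attributes are dropped. A minimal usage sketch under assumptions, not part of the commit: it assumes IsoformerTokenizer is importable from isoformer_tokenizer, that the vocab list files (dna_vocab_list.txt, rna_vocab_list.txt, protein_vocab_list.txt) are in the working directory, and the example sequences are illustrative.

# Hypothetical usage sketch based on the post-commit signature.
from isoformer_tokenizer import IsoformerTokenizer

# After this commit, __init__(self, **kwargs) takes no `config` argument;
# the inner DNA/RNA/protein HF tokenizers are built internally.
tokenizer = IsoformerTokenizer()

# __call__ tokenizes all three modalities at once; per the diff, RNA and
# protein inputs are padded to a fixed max_length of 1024 tokens, while
# DNA input is tokenized without explicit padding.
dna_out, rna_out, protein_out = tokenizer(
    dna_input="ATGCATGC",        # illustrative DNA sequence
    rna_input="AUGGCUACG",       # illustrative RNA sequence
    protein_input="MKTLLILAV",   # illustrative protein sequence
)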