isoformer-anonymous committed on
Commit
eeaf5b8
1 Parent(s): c4cb572

Upload tokenizer

Browse files
Files changed (1) hide show
  1. isoformer_tokenizer.py +1 -6
isoformer_tokenizer.py CHANGED
@@ -36,7 +36,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
36
 
37
  def __init__(
38
  self,
39
- config,
40
  **kwargs
41
  ):
42
 
@@ -55,9 +54,6 @@ class IsoformerTokenizer(PreTrainedTokenizer):
55
  # protein_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
56
  # protein_hf_tokenizer.init_kwargs["eos_token"] = None # Ensures it doesn't come back when reloading
57
 
58
- self.num_tokens_per_seq_nuctf = config.num_tokens_per_seq_nuctf
59
- self.num_tokens_per_seq_nuctf_rna = config.num_tokens_per_seq_nuctf_rna
60
- self.num_protein_tokens_per_seq = config.num_protein_tokens_per_seq
61
  self.dna_tokenizer = dna_hf_tokenizer
62
  self.rna_tokenizer = rna_hf_tokenizer
63
  self.protein_tokenizer = protein_hf_tokenizer
@@ -65,12 +61,11 @@ class IsoformerTokenizer(PreTrainedTokenizer):
65
  self.dna_tokens = open("dna_vocab_list.txt", "r").read() .split("\n")
66
  self.rna_tokens = open("rna_vocab_list.txt", "r").read() .split("\n")
67
  self.protein_tokens = open("protein_vocab_list.txt", "r").read() .split("\n")
68
- self.config = config
69
 
70
  super().__init__(**kwargs)
71
 
72
  def __call__(self, dna_input, rna_input, protein_input):
73
- dna_output = self.dna_tokenizer(dna_input) #, max_length=196608, padding="max_length")
74
  rna_output = self.rna_tokenizer(rna_input, max_length=1024, padding="max_length")
75
  protein_output = self.protein_tokenizer(protein_input, max_length=1024, padding="max_length")
76
  return dna_output, rna_output, protein_output
 
36
 
37
  def __init__(
38
  self,
 
39
  **kwargs
40
  ):
41
 
 
54
  # protein_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
55
  # protein_hf_tokenizer.init_kwargs["eos_token"] = None # Ensures it doesn't come back when reloading
56
 
 
 
 
57
  self.dna_tokenizer = dna_hf_tokenizer
58
  self.rna_tokenizer = rna_hf_tokenizer
59
  self.protein_tokenizer = protein_hf_tokenizer
 
61
  self.dna_tokens = open("dna_vocab_list.txt", "r").read() .split("\n")
62
  self.rna_tokens = open("rna_vocab_list.txt", "r").read() .split("\n")
63
  self.protein_tokens = open("protein_vocab_list.txt", "r").read() .split("\n")
 
64
 
65
  super().__init__(**kwargs)
66
 
67
  def __call__(self, dna_input, rna_input, protein_input):
68
+ dna_output = self.dna_tokenizer(dna_input)
69
  rna_output = self.rna_tokenizer(rna_input, max_length=1024, padding="max_length")
70
  protein_output = self.protein_tokenizer(protein_input, max_length=1024, padding="max_length")
71
  return dna_output, rna_output, protein_output