Jinze committed on
Commit
0337806
1 Parent(s): 45dfea6

Support streaming

Browse files
Files changed (1) hide show
  1. tokenization_qwen.py +9 -1
tokenization_qwen.py CHANGED
@@ -153,6 +153,10 @@ class QWenTokenizer(PreTrainedTokenizer):
153
  self.box_end_id = self.special_tokens[self.box_end_tag]
154
  self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
  self.quad_end_id = self.special_tokens[self.quad_end_tag]
 
 
 
 
156
 
157
  enc = tiktoken.Encoding(
158
  "Qwen",
@@ -354,7 +358,11 @@ class QWenTokenizer(PreTrainedTokenizer):
354
  token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
355
 
356
  if skip_special_tokens:
357
- token_ids = [i for i in token_ids if i < self.eod_id]
 
 
 
 
358
  return self.tokenizer.decode(token_ids, errors=errors or self.errors)
359
 
360
  def to_list_format(self, text: str):
 
153
  self.box_end_id = self.special_tokens[self.box_end_tag]
154
  self.quad_start_id = self.special_tokens[self.quad_start_tag]
155
  self.quad_end_id = self.special_tokens[self.quad_end_tag]
156
+ self.image_special_tokens = set([
157
+ self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
158
+ self.quad_start_id, self.quad_end_id,
159
+ ])
160
 
161
  enc = tiktoken.Encoding(
162
  "Qwen",
 
358
  token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
359
 
360
  if skip_special_tokens:
361
+ if kwargs.get('keep_image_special', False):
362
+ token_ids = [i for i in token_ids if i < self.eod_id
363
+ or i in self.image_special_tokens]
364
+ else:
365
+ token_ids = [i for i in token_ids if i < self.eod_id]
366
  return self.tokenizer.decode(token_ids, errors=errors or self.errors)
367
 
368
  def to_list_format(self, text: str):