processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") raises an exception
I got the following error when I load the processor; I assume it happens while loading the tokenizer from the pretrained checkpoint.
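For reference, the cell is essentially the snippet below; the traceback only shows its lines 4, 6, 7, 9 and 10, so the imports at the top are my reconstruction and not visible in the error output.

import torch
import requests
from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor

model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")  # this line raises

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

The full traceback: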
Exception Traceback (most recent call last)
Cell In[2], line 7
4 import requests
6 model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
----> 7 processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
9 device = "cuda" if torch.cuda.is_available() else "cpu"
10 model.to(device)
File ~/.local/lib/python3.9/site-packages/transformers/models/instructblip/processing_instructblip.py:170, in InstructBlipProcessor.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
167 @classmethod
168 def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
169 qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
--> 170 args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
171 args.append(qformer_tokenizer)
172 return cls(*args)
File ~/.local/lib/python3.9/site-packages/transformers/processing_utils.py:259, in ProcessorMixin._get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
256 else:
257 attribute_class = getattr(transformers_module, class_name)
--> 259 args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
260 return args
File ~/.local/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py:692, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
688 if tokenizer_class is None:
689 raise ValueError(
690 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
691 )
--> 692 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
694 # Otherwise we have to be creative.
695 # if model is an encoder decoder, the encoder tokenizer class is used by default
696 if isinstance(config, EncoderDecoderConfig):
File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:1846, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
1843 else:
1844 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1846 return cls._from_pretrained(
1847 resolved_vocab_files,
1848 pretrained_model_name_or_path,
1849 init_configuration,
1850 *init_inputs,
1851 use_auth_token=token,
1852 cache_dir=cache_dir,
1853 local_files_only=local_files_only,
1854 _commit_hash=commit_hash,
1855 _is_local=is_local,
1856 **kwargs,
1857 )
File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2009, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2007 # Instantiate tokenizer.
2008 try:
-> 2009 tokenizer = cls(*init_inputs, **init_kwargs)
2010 except OSError:
2011 raise OSError(
2012 "Unable to load vocabulary from file. "
2013 "Please check that the provided vocabulary is accessible and not corrupted."
2014 )
File ~/.local/lib/python3.9/site-packages/transformers/models/llama/tokenization_llama_fast.py:100, in LlamaTokenizerFast.__init__(self, vocab_file, tokenizer_file, clean_up_tokenization_spaces, unk_token, bos_token, eos_token, add_bos_token, add_eos_token, **kwargs)
88 def __init__(
89 self,
90 vocab_file=None,
(...)
98 **kwargs,
99 ):
--> 100 super().__init__(
101 vocab_file=vocab_file,
102 tokenizer_file=tokenizer_file,
103 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
104 unk_token=unk_token,
105 bos_token=bos_token,
106 eos_token=eos_token,
107 **kwargs,
108 )
109 self._add_bos_token = add_bos_token
110 self._add_eos_token = add_eos_token
File ~/.local/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py:111, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
108 fast_tokenizer = copy.deepcopy(tokenizer_object)
109 elif fast_tokenizer_file is not None and not from_slow:
110 # We have a serialization from tokenizers which let us directly build the backend
--> 111 fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
112 elif slow_tokenizer is not None:
113 # We need to convert a slow tokenizer to build the backend
114 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
Exception: data did not match any variant of untagged enum PyNormalizerTypeWrapper at line 58 column 3
Hi @handing2412,
Thanks for the issue! Can you try re-running your snippet with the latest transformers and tokenizers versions?
pip install -U transformers tokenizers
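After upgrading, restart the notebook kernel and double-check which versions are actually imported before retrying the load. A quick sketch (the retry line is just your original processor call):

import transformers
import tokenizers

# Confirm the upgraded packages are the ones the kernel picked up
print("transformers:", transformers.__version__)
print("tokenizers:", tokenizers.__version__)

from transformers import InstructBlipProcessor
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")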