from transformers import PretrainedConfig, AutoConfig

# Default decoder configuration: a serialized GPT-2 config with `is_decoder`
# and `add_cross_attention` enabled so the decoder can attend to the CLIP
# encoder's hidden states.
DEFAULT_DECODER_CONFIG = {
    "_name_or_path": "",
    "activation_function": "gelu_new",
    "add_cross_attention": True,
    "architectures": ["GPT2LMHeadModel"],
    "attn_pdrop": 0.1,
    "bad_words_ids": None,
    "begin_suppress_tokens": None,
    "bos_token_id": 50256,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": None,
    "decoder_start_token_id": None,
    "diversity_penalty": 0.0,
    "do_sample": False,
    "early_stopping": False,
    "embd_pdrop": 0.1,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 50256,
    "exponential_decay_length_penalty": None,
    "finetuning_task": None,
    "forced_bos_token_id": None,
    "forced_eos_token_id": None,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
    "initializer_range": 0.02,
    "is_decoder": True,
    "is_encoder_decoder": False,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "layer_norm_epsilon": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "gpt2",
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_inner": None,
    "n_layer": 12,
    "n_positions": 1024,
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_return_sequences": 1,
    "output_attentions": False,
    "output_hidden_states": False,
    "output_scores": False,
    "pad_token_id": None,
    "prefix": None,
    "problem_type": None,
    "pruned_heads": {},
    "remove_invalid_values": False,
    "reorder_and_upcast_attn": False,
    "repetition_penalty": 1.0,
    "resid_pdrop": 0.1,
    "return_dict": True,
    "return_dict_in_generate": False,
    "scale_attn_by_inverse_layer_idx": False,
    "scale_attn_weights": True,
    "sep_token_id": None,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "suppress_tokens": None,
    "task_specific_params": {"text-generation": {"do_sample": True, "max_length": 50}},
    "temperature": 1.0,
    "tf_legacy_loss": False,
    "tie_encoder_decoder": False,
    "tie_word_embeddings": True,
    "tokenizer_class": None,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": None,
    "torchscript": False,
    "typical_p": 1.0,
    "use_bfloat16": False,
    "use_cache": True,
    "vocab_size": 50257,
}


class CLIPEncoderDecoderConfig(PretrainedConfig):
    model_type = "clip-encoder-decoder"

    def __init__(self, encoder=None, decoder=None, **kwargs):
        super().__init__(**kwargs)
        # Rebuild the sub-configs from their serialized dict form. The decoder
        # defaults to the GPT-2 configuration above (moved out of the
        # signature to avoid a mutable default argument). Storing the encoder
        # config as a config object mirrors `VisionEncoderDecoderConfig`;
        # previously `encoder` fell through to `**kwargs` and was kept as a
        # raw dict.
        if encoder is not None:
            self.encoder = AutoConfig.for_model(**encoder)
        if decoder is None:
            decoder = DEFAULT_DECODER_CONFIG
        self.decoder = AutoConfig.for_model(**decoder)
        self.is_encoder_decoder = True

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`CLIPEncoderDecoderConfig`] (or a derived class) from a
        pre-trained encoder model configuration and a decoder model
        configuration.

        Returns:
            [`CLIPEncoderDecoderConfig`]: An instance of a configuration object
        """
        # The decoder must run as a decoder with cross-attention so it can
        # condition on the encoder's hidden states.
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True
        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
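

# Usage sketch (illustrative, not part of the original class): build the
# combined config from freshly initialized sub-configs. `CLIPVisionConfig`
# and `GPT2Config` are standard `transformers` classes; this assumes a
# `transformers` version in which "clip_vision_model" is registered with
# `AutoConfig`, and no pretrained weights are downloaded.
if __name__ == "__main__":
    from transformers import CLIPVisionConfig, GPT2Config

    encoder_config = CLIPVisionConfig()  # default CLIP ViT encoder hyperparameters
    decoder_config = GPT2Config()  # default GPT-2 decoder hyperparameters

    config = CLIPEncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config
    )
    print(config.decoder.model_type)  # "gpt2"
    print(config.decoder.add_cross_attention)  # True
    print(config.is_encoder_decoder)  # True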