---
language:
- en
library_name: transformers
pipeline_tag: image-text-to-text
tags:
- vision
---

# ADD HEAD

```
from transformers import VisionEncoderDecoderModel

print('Add Vision...')

# ADD HEAD
# Combine a pre-trained ViT encoder and a pre-trained text decoder
# to form a Seq2Seq vision-language model
Vmodel = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "LeroyDyer/Mixtral_AI_Tiny"
)
_Encoder_ImageProcessor = Vmodel.encoder    # ViT encoder module
_Decoder_ImageTokenizer = Vmodel.decoder    # text decoder module
_VisionEncoderDecoderModel = Vmodel

# Attach the vision encoder-decoder to the previously loaded
# language model (LM_MODEL)
LM_MODEL.VisionEncoderDecoder = _VisionEncoderDecoderModel
# Add sub-components
LM_MODEL.Encoder_ImageProcessor = _Encoder_ImageProcessor
LM_MODEL.Decoder_ImageTokenizer = _Decoder_ImageTokenizer
LM_MODEL
```
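
For reference, here is a minimal sketch of running the combined model once the head is attached. This assumes the model has been trained (or at least configured) for generation; the ViT image processor and the decoder's tokenizer are loaded separately, `example.jpg` is a placeholder image path, and the `decoder_start_token_id` / `pad_token_id` settings are assumptions, since the base decoder may not define them.

```
from PIL import Image
from transformers import ViTImageProcessor, AutoTokenizer

# Load the image processor for the encoder and the tokenizer for the decoder
processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("LeroyDyer/Mixtral_AI_Tiny")

# Generation requires these ids to be set; the fallbacks here are assumptions
Vmodel.config.decoder_start_token_id = tokenizer.bos_token_id
Vmodel.config.pad_token_id = (
    tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
)

# "example.jpg" is a placeholder image path
pixel_values = processor(images=Image.open("example.jpg"), return_tensors="pt").pixel_values

# Encode the image and let the decoder generate text conditioned on it
generated_ids = Vmodel.generate(pixel_values, max_new_tokens=32)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

Until the encoder-decoder cross-attention has been fine-tuned on paired image-text data, the generated text will not be meaningful; the sketch only demonstrates the wiring.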