---
license: gpl-3.0
language:
- zh
- en
pipeline_tag: visual-question-answering
tags:
- ziya
- fengshenbang
- LVLM
- visual question answering
---

# Ziya-Visual-14B-Chat

## 软件依赖 Dependencies

```
pip install torch==1.12.1 tokenizers==0.13.3 git+https://github.com/huggingface/transformers
```

## 模型分类 Model Taxonomy

| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
| :----: | :----: | :----: | :----: | :----: | :----: |
| 多模态 Multi-Modal | 通用 General | 姜子牙-多模态 Ziya-Visual | InstructBLIP LLaMA | 14B | English&Chinese |

## 使用 Usage

```python
"""Minimal visual question answering example for Ziya-Visual-14B-Chat.

Loads the model and its two tokenizers from ``_MODEL_PATH``, preprocesses each
image in ``image_url``, asks the matching question from ``prompt`` and prints
the generated answer.
"""
import os
import random

import gradio as gr
import torch
from PIL import Image
from torchvision.transforms import (
    Compose,
    InterpolationMode,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)
from transformers import BertTokenizer, GenerationConfig, LlamaTokenizer

from fengshen.models.instruct_ditto.modeling_instruct_ditto import (
    DittoLMForConditionalGeneration,
    DittoQFromerForPretrain,
    InstructDittoLMForConditionalGeneration,
)

# Per-channel normalization statistics applied to every input image.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

# Every tensor below is moved to this device, so the script also runs
# (slowly) on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
_MODEL_PATH = "your model path"

# Deterministic inference-time preprocessing: a fixed bicubic resize to
# 224x224. Random crops and horizontal flips are training augmentations;
# at inference they make answers non-reproducible and can mirror the image.
transforms = Compose([
    Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
    ToTensor(),
    Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])

model = InstructDittoLMForConditionalGeneration.from_pretrained(_MODEL_PATH).to(device).eval()
instruct_tokenizer = BertTokenizer.from_pretrained(os.path.join(_MODEL_PATH, "qformer_tokenizer"))
tokenizer = LlamaTokenizer.from_pretrained(_MODEL_PATH, use_fast=False)
instruct_tokenizer.padding_side = 'right'
tokenizer.padding_side = 'left'

qformer_prompt = "{prompt}"
prompt_prefix = ''
# NOTE(review): this template starts and ends with a bare ":". It looks as if
# role tags (e.g. "<human>" / "<bot>") were stripped when this card was
# rendered -- confirm against the training prompt format before relying on it.
llm_prompt = ": {prompt}\n:"

# One question per image, matched by position.
prompt = ["your prompt"]
image_url = ["your image"]
if len(prompt) != len(image_url):
    raise ValueError(
        f"expected one prompt per image, got {len(prompt)} prompt(s) "
        f"and {len(image_url)} image(s)"
    )

qformer_prompt_list = [qformer_prompt.format_map({"prompt": text}) for text in prompt]
llm_prompt_list = [llm_prompt.format_map({"prompt": text}) for text in prompt]

imgs = []
for img_path in image_url:
    # Close the underlying file as soon as the pixels have been converted.
    with Image.open(img_path) as img:
        imgs.append(transforms(img.convert('RGB')))

# temperature / top_p only influence sampling. With do_sample left at its
# default they have no effect and decoding is greedy; uncomment
# do_sample=True to sample.
config = GenerationConfig(
    # do_sample=True,
    # num_beams=3,
    # min_length=4,
    max_new_tokens=128,
    repetition_penalty=1.18,
    # length_penalty=1,
    temperature=0.7,
    top_p=0.1,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=39410,
)

# The prefix is identical for every sample, so tokenize it once.
prompt_prefix_ids = tokenizer(prompt_prefix, return_tensors="pt").input_ids.to(device)

with torch.no_grad():  # inference only: no autograd bookkeeping needed
    for question, qformer_text, llm_text, img in zip(
        prompt, qformer_prompt_list, llm_prompt_list, imgs
    ):
        qformer_instruct_ids = instruct_tokenizer(qformer_text, return_tensors="pt").input_ids
        llm_instruct_ids = tokenizer(
            llm_text, return_tensors="pt", add_special_tokens=False
        ).input_ids

        captions = model.generate(
            img.unsqueeze(0).to(device),  # add the batch dimension
            qformer_instruct_ids=qformer_instruct_ids.to(device),
            prompt_prefix_ids=prompt_prefix_ids,
            llm_instruct_ids=llm_instruct_ids.to(device),
            generation_config=config,
        )

        # Drop special tokens so they do not show up in the printed answer.
        caption = tokenizer.decode(captions[0], skip_special_tokens=True)
        print("问: " + question + "\n" + "答: " + caption)
```

## 引用 Citation

如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2210.08590),[论文](https://arxiv.org/abs/2310.08166):

If you are using the resource for your work, please cite our [paper](https://arxiv.org/abs/2210.08590) and [paper](https://arxiv.org/abs/2310.08166):